howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26
   27from howard.functions.commons import *
   28from howard.objects.database import *
   29from howard.functions.databases import *
   30from howard.functions.utils import *
   31
   32
   33class Variants:
   34
   35    def __init__(
   36        self,
   37        conn=None,
   38        input: str = None,
   39        output: str = None,
   40        config: dict = {},
   41        param: dict = {},
   42        load: bool = False,
   43    ) -> None:
   44        """
   45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   46        header
   47
   48        :param conn: the connection to the database
   49        :param input: the input file
   50        :param output: the output file
   51        :param config: a dictionary containing the configuration of the model
   52        :param param: a dictionary containing the parameters of the model
   53        """
   54
   55        # Init variables
   56        self.init_variables()
   57
   58        # Input
   59        self.set_input(input)
   60
   61        # Config
   62        self.set_config(config)
   63
   64        # Param
   65        self.set_param(param)
   66
   67        # Output
   68        self.set_output(output)
   69
   70        # connexion
   71        self.set_connexion(conn)
   72
   73        # Header
   74        self.set_header()
   75
   76        # Load data
   77        if load:
   78            self.load_data()
   79
   80    def set_input(self, input: str = None) -> None:
   81        """
   82        The function `set_input` takes a file name as input, extracts the name and extension, and sets
   83        attributes in the class accordingly.
   84
   85        :param input: The `set_input` method in the provided code snippet is used to set attributes
   86        related to the input file. Here's a breakdown of the parameters and their usage in the method:
   87        :type input: str
   88        """
   89
   90        if input and not isinstance(input, str):
   91            try:
   92                self.input = input.name
   93            except:
   94                log.error(f"Input file '{input} in bad format")
   95                raise ValueError(f"Input file '{input} in bad format")
   96        else:
   97            self.input = input
   98
   99        # Input format
  100        if input:
  101            input_name, input_extension = os.path.splitext(self.input)
  102            self.input_name = input_name
  103            self.input_extension = input_extension
  104            self.input_format = self.input_extension.replace(".", "")
  105
  106    def set_config(self, config: dict) -> None:
  107        """
  108        The set_config function takes a config object and assigns it as the configuration object for the
  109        class.
  110
  111        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  112        contains configuration settings for the class. When you call the `set_config` function with a
  113        dictionary object as the argument, it will set that dictionary as the configuration object for
  114        the class
  115        :type config: dict
  116        """
  117
  118        self.config = config
  119
  120    def set_param(self, param: dict) -> None:
  121        """
  122        This function sets a parameter object for the class based on the input dictionary.
  123
  124        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  125        as the `param` attribute of the class instance
  126        :type param: dict
  127        """
  128
  129        self.param = param
  130
  131    def init_variables(self) -> None:
  132        """
  133        This function initializes the variables that will be used in the rest of the class
  134        """
  135
  136        self.prefix = "howard"
  137        self.table_variants = "variants"
  138        self.dataframe = None
  139
  140        self.comparison_map = {
  141            "gt": ">",
  142            "gte": ">=",
  143            "lt": "<",
  144            "lte": "<=",
  145            "equals": "=",
  146            "contains": "SIMILAR TO",
  147        }
  148
  149        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  150
  151        self.code_type_map_to_sql = {
  152            "Integer": "INTEGER",
  153            "String": "VARCHAR",
  154            "Float": "FLOAT",
  155            "Flag": "VARCHAR",
  156        }
  157
  158        self.index_additionnal_fields = []
  159
  160    def get_indexing(self) -> bool:
  161        """
  162        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  163        returns False.
  164        :return: The value of the indexing parameter.
  165        """
  166
  167        return self.get_param().get("indexing", False)
  168
  169    def get_connexion_config(self) -> dict:
  170        """
  171        The function `get_connexion_config` returns a dictionary containing the configuration for a
  172        connection, including the number of threads and memory limit.
  173        :return: a dictionary containing the configuration for the Connexion library.
  174        """
  175
  176        # config
  177        config = self.get_config()
  178
  179        # Connexion config
  180        connexion_config = {}
  181        threads = self.get_threads()
  182
  183        # Threads
  184        if threads:
  185            connexion_config["threads"] = threads
  186
  187        # Memory
  188        # if config.get("memory", None):
  189        #     connexion_config["memory_limit"] = config.get("memory")
  190        if self.get_memory():
  191            connexion_config["memory_limit"] = self.get_memory()
  192
  193        # Temporary directory
  194        if config.get("tmp", None):
  195            connexion_config["temp_directory"] = config.get("tmp")
  196
  197        # Access
  198        if config.get("access", None):
  199            access = config.get("access")
  200            if access in ["RO"]:
  201                access = "READ_ONLY"
  202            elif access in ["RW"]:
  203                access = "READ_WRITE"
  204            connexion_db = self.get_connexion_db()
  205            if connexion_db in ":memory:":
  206                access = "READ_WRITE"
  207            connexion_config["access_mode"] = access
  208
  209        return connexion_config
  210
  211    def get_duckdb_settings(self) -> dict:
  212        """
  213        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  214        string.
  215        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  216        """
  217
  218        # config
  219        config = self.get_config()
  220
  221        # duckdb settings
  222        duckdb_settings_dict = {}
  223        if config.get("duckdb_settings", None):
  224            duckdb_settings = config.get("duckdb_settings")
  225            duckdb_settings = full_path(duckdb_settings)
  226            # duckdb setting is a file
  227            if os.path.exists(duckdb_settings):
  228                with open(duckdb_settings) as json_file:
  229                    duckdb_settings_dict = yaml.safe_load(json_file)
  230            # duckdb settings is a string
  231            else:
  232                duckdb_settings_dict = json.loads(duckdb_settings)
  233
  234        return duckdb_settings_dict
  235
  236    def set_connexion_db(self) -> str:
  237        """
  238        The function `set_connexion_db` returns the appropriate database connection string based on the
  239        input format and connection type.
  240        :return: the value of the variable `connexion_db`.
  241        """
  242
  243        # Default connexion db
  244        default_connexion_db = ":memory:"
  245
  246        # Find connexion db
  247        if self.get_input_format() in ["db", "duckdb"]:
  248            connexion_db = self.get_input()
  249        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  250            connexion_db = default_connexion_db
  251        elif self.get_connexion_type() in ["tmpfile"]:
  252            tmp_name = tempfile.mkdtemp(
  253                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  254            )
  255            connexion_db = f"{tmp_name}/tmp.db"
  256        elif self.get_connexion_type() != "":
  257            connexion_db = self.get_connexion_type()
  258        else:
  259            connexion_db = default_connexion_db
  260
  261        # Set connexion db
  262        self.connexion_db = connexion_db
  263
  264        return connexion_db
  265
  266    def set_connexion(self, conn) -> None:
  267        """
  268        The function `set_connexion` creates a connection to a database, with options for different
  269        database formats and settings.
  270
  271        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  272        database. If a connection is not provided, a new connection to an in-memory database is created.
  273        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  274        sqlite
  275        """
  276
  277        # Connexion db
  278        connexion_db = self.set_connexion_db()
  279
  280        # Connexion config
  281        connexion_config = self.get_connexion_config()
  282
  283        # Connexion format
  284        connexion_format = self.get_config().get("connexion_format", "duckdb")
  285        # Set connexion format
  286        self.connexion_format = connexion_format
  287
  288        # Connexion
  289        if not conn:
  290            if connexion_format in ["duckdb"]:
  291                conn = duckdb.connect(connexion_db, config=connexion_config)
  292                # duckDB settings
  293                duckdb_settings = self.get_duckdb_settings()
  294                if duckdb_settings:
  295                    for setting in duckdb_settings:
  296                        setting_value = duckdb_settings.get(setting)
  297                        if isinstance(setting_value, str):
  298                            setting_value = f"'{setting_value}'"
  299                        conn.execute(f"PRAGMA {setting}={setting_value};")
  300            elif connexion_format in ["sqlite"]:
  301                conn = sqlite3.connect(connexion_db)
  302
  303        # Set connexion
  304        self.conn = conn
  305
  306        # Log
  307        log.debug(f"connexion_format: {connexion_format}")
  308        log.debug(f"connexion_db: {connexion_db}")
  309        log.debug(f"connexion config: {connexion_config}")
  310        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  311
  312    def set_output(self, output: str = None) -> None:
  313        """
  314        The `set_output` function in Python sets the output file based on the input or a specified key
  315        in the config file, extracting the output name, extension, and format.
  316
  317        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  318        the output file. If the config file has an 'output' key, the method sets the output to the value
  319        of that key. If no output is provided, it sets the output to `None`
  320        :type output: str
  321        """
  322
  323        if output and not isinstance(output, str):
  324            self.output = output.name
  325        else:
  326            self.output = output
  327
  328        # Output format
  329        if self.output:
  330            output_name, output_extension = os.path.splitext(self.output)
  331            self.output_name = output_name
  332            self.output_extension = output_extension
  333            self.output_format = self.output_extension.replace(".", "")
  334        else:
  335            self.output_name = None
  336            self.output_extension = None
  337            self.output_format = None
  338
  339    def set_header(self) -> None:
  340        """
  341        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  342        """
  343
  344        input_file = self.get_input()
  345        default_header_list = [
  346            "##fileformat=VCFv4.2",
  347            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  348        ]
  349
  350        # Full path
  351        input_file = full_path(input_file)
  352
  353        if input_file:
  354
  355            input_format = self.get_input_format()
  356            input_compressed = self.get_input_compressed()
  357            config = self.get_config()
  358            header_list = default_header_list
  359            if input_format in [
  360                "vcf",
  361                "hdr",
  362                "tsv",
  363                "csv",
  364                "psv",
  365                "parquet",
  366                "db",
  367                "duckdb",
  368            ]:
  369                # header provided in param
  370                if config.get("header_file", None):
  371                    with open(config.get("header_file"), "rt") as f:
  372                        header_list = self.read_vcf_header(f)
  373                # within a vcf file format (header within input file itsself)
  374                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  375                    # within a compressed vcf file format (.vcf.gz)
  376                    if input_compressed:
  377                        with bgzf.open(input_file, "rt") as f:
  378                            header_list = self.read_vcf_header(f)
  379                    # within an uncompressed vcf file format (.vcf)
  380                    else:
  381                        with open(input_file, "rt") as f:
  382                            header_list = self.read_vcf_header(f)
  383                # header provided in default external file .hdr
  384                elif os.path.exists((input_file + ".hdr")):
  385                    with open(input_file + ".hdr", "rt") as f:
  386                        header_list = self.read_vcf_header(f)
  387                else:
  388                    try:  # Try to get header info fields and file columns
  389
  390                        with tempfile.TemporaryDirectory() as tmpdir:
  391
  392                            # Create database
  393                            db_for_header = Database(database=input_file)
  394
  395                            # Get header columns for infos fields
  396                            db_header_from_columns = (
  397                                db_for_header.get_header_from_columns()
  398                            )
  399
  400                            # Get real columns in the file
  401                            db_header_columns = db_for_header.get_columns()
  402
  403                            # Write header file
  404                            header_file_tmp = os.path.join(tmpdir, "header")
  405                            f = open(header_file_tmp, "w")
  406                            vcf.Writer(f, db_header_from_columns)
  407                            f.close()
  408
  409                            # Replace #CHROM line with rel columns
  410                            header_list = db_for_header.read_header_file(
  411                                header_file=header_file_tmp
  412                            )
  413                            header_list[-1] = "\t".join(db_header_columns)
  414
  415                    except:
  416
  417                        log.warning(
  418                            f"No header for file {input_file}. Set as default VCF header"
  419                        )
  420                        header_list = default_header_list
  421
  422            else:  # try for unknown format ?
  423
  424                log.error(f"Input file format '{input_format}' not available")
  425                raise ValueError(f"Input file format '{input_format}' not available")
  426
  427            if not header_list:
  428                header_list = default_header_list
  429
  430            # header as list
  431            self.header_list = header_list
  432
  433            # header as VCF object
  434            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  435
  436        else:
  437
  438            self.header_list = None
  439            self.header_vcf = None
  440
  441    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  442        """
  443        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  444        DataFrame based on the connection format.
  445
  446        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  447        represents the SQL query you want to execute. This query will be used to fetch data from a
  448        database and convert it into a pandas DataFrame
  449        :type query: str
  450        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  451        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  452        function will only fetch up to that number of rows from the database query result. If no limit
  453        is specified,
  454        :type limit: int
  455        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  456        """
  457
  458        # Connexion format
  459        connexion_format = self.get_connexion_format()
  460
  461        # Limit in query
  462        if limit:
  463            pd.set_option("display.max_rows", limit)
  464            if connexion_format in ["duckdb"]:
  465                df = (
  466                    self.conn.execute(query)
  467                    .fetch_record_batch(limit)
  468                    .read_next_batch()
  469                    .to_pandas()
  470                )
  471            elif connexion_format in ["sqlite"]:
  472                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  473
  474        # Full query
  475        else:
  476            if connexion_format in ["duckdb"]:
  477                df = self.conn.execute(query).df()
  478            elif connexion_format in ["sqlite"]:
  479                df = pd.read_sql_query(query, self.conn)
  480
  481        return df
  482
  483    def get_overview(self) -> None:
  484        """
  485        The function prints the input, output, config, and dataframe of the current object
  486        """
  487        table_variants_from = self.get_table_variants(clause="from")
  488        sql_columns = self.get_header_columns_as_sql()
  489        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  490        df = self.get_query_to_df(sql_query_export)
  491        log.info(
  492            "Input:  "
  493            + str(self.get_input())
  494            + " ["
  495            + str(str(self.get_input_format()))
  496            + "]"
  497        )
  498        log.info(
  499            "Output: "
  500            + str(self.get_output())
  501            + " ["
  502            + str(str(self.get_output_format()))
  503            + "]"
  504        )
  505        log.info("Config: ")
  506        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  507            "\n"
  508        ):
  509            log.info("\t" + str(d))
  510        log.info("Param: ")
  511        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  512            "\n"
  513        ):
  514            log.info("\t" + str(d))
  515        log.info("Sample list: " + str(self.get_header_sample_list()))
  516        log.info("Dataframe: ")
  517        for d in str(df).split("\n"):
  518            log.info("\t" + str(d))
  519
  520        # garbage collector
  521        del df
  522        gc.collect()
  523
  524        return None
  525
    def get_stats(self) -> dict:
        """
        Calculate and return various statistics of the current object.

        The returned dictionary contains:
          - "Infos": input file, number of variants / samples / INFO and
            FORMAT fields,
          - "Variants": counts by chromosome, by type (SNV/MNV/InDel/Total)
            and SNV substitutions,
          - "Samples": genotype counts per sample (when GT/FORMAT present),
          - "Header": description of the INFO and FORMAT header fields,
          - "Quality": QUAL statistics (when the QUAL column is present).

        NOTE(review): the SQL uses duckdb-specific functions
        (REGEXP_EXTRACT, len, string_split, median) executed through
        ``self.conn`` — presumably requires a duckdb connexion; confirm
        behavior with a sqlite connexion.

        :return: a dictionary of statistics as described above
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header INFO and FORMAT fields, as mappings and as plain lists
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when genotypes are present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample; rows are kept only when the
                # sample column has as many ':'-separated fields as FORMAT
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` numbers the fields consecutively across both sections
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: special VCF "Number" codes mapped back to their
                # symbolic form (. = unknown, A/G/R per VCF spec)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel

        # NOTE(review): in the 'InDel' branch, SQL AND binds tighter than OR,
        # so the condition reads `len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT))` — confirm the intended grouping.
        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # SNV substitutions (REF>ALT pairs), most frequent first
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  747
  748    def stats_to_file(self, file: str = None) -> str:
  749        """
  750        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  751        into a JSON object, and writes the JSON object to the specified file.
  752
  753        :param file: The `file` parameter is a string that represents the file path where the JSON data
  754        will be written
  755        :type file: str
  756        :return: the name of the file that was written to.
  757        """
  758
  759        # Get stats
  760        stats = self.get_stats()
  761
  762        # Serializing json
  763        json_object = json.dumps(stats, indent=4)
  764
  765        # Writing to sample.json
  766        with open(file, "w") as outfile:
  767            outfile.write(json_object)
  768
  769        return file
  770
  771    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  772        """
  773        The `print_stats` function generates a markdown file and prints the statistics contained in a
  774        JSON file in a formatted manner.
  775
  776        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  777        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  778        provided, a temporary directory will be created and the stats will be saved in a file named
  779        "stats.md" within that
  780        :type output_file: str
  781        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  782        file where the statistics will be saved. If no value is provided, a temporary directory will be
  783        created and a default file name "stats.json" will be used
  784        :type json_file: str
  785        :return: The function `print_stats` does not return any value. It has a return type annotation
  786        of `None`.
  787        """
  788
  789        # Full path
  790        output_file = full_path(output_file)
  791        json_file = full_path(json_file)
  792
  793        with tempfile.TemporaryDirectory() as tmpdir:
  794
  795            # Files
  796            if not output_file:
  797                output_file = os.path.join(tmpdir, "stats.md")
  798            if not json_file:
  799                json_file = os.path.join(tmpdir, "stats.json")
  800
  801            # Create folders
  802            if not os.path.exists(os.path.dirname(output_file)):
  803                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  804            if not os.path.exists(os.path.dirname(json_file)):
  805                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  806
  807            # Create stats JSON file
  808            stats_file = self.stats_to_file(file=json_file)
  809
  810            # Print stats file
  811            with open(stats_file) as f:
  812                stats = yaml.safe_load(f)
  813
  814            # Output
  815            output_title = []
  816            output_index = []
  817            output = []
  818
  819            # Title
  820            output_title.append("# HOWARD Stats")
  821
  822            # Index
  823            output_index.append("## Index")
  824
  825            # Process sections
  826            for section in stats:
  827                infos = stats.get(section)
  828                section_link = "#" + section.lower().replace(" ", "-")
  829                output.append(f"## {section}")
  830                output_index.append(f"- [{section}]({section_link})")
  831
  832                if len(infos):
  833                    for info in infos:
  834                        try:
  835                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  836                            is_df = True
  837                        except:
  838                            try:
  839                                df = pd.DataFrame.from_dict(
  840                                    json.loads((infos.get(info))), orient="index"
  841                                )
  842                                is_df = True
  843                            except:
  844                                is_df = False
  845                        if is_df:
  846                            output.append(f"### {info}")
  847                            info_link = "#" + info.lower().replace(" ", "-")
  848                            output_index.append(f"   - [{info}]({info_link})")
  849                            output.append(f"{df.to_markdown(index=False)}")
  850                        else:
  851                            output.append(f"- {info}: {infos.get(info)}")
  852                else:
  853                    output.append(f"NA")
  854
  855            # Write stats in markdown file
  856            with open(output_file, "w") as fp:
  857                for item in output_title:
  858                    fp.write("%s\n" % item)
  859                for item in output_index:
  860                    fp.write("%s\n" % item)
  861                for item in output:
  862                    fp.write("%s\n" % item)
  863
  864            # Output stats in markdown
  865            print("")
  866            print("\n\n".join(output_title))
  867            print("")
  868            print("\n\n".join(output))
  869            print("")
  870
  871        return None
  872
  873    def get_input(self) -> str:
  874        """
  875        It returns the value of the input variable.
  876        :return: The input is being returned.
  877        """
  878        return self.input
  879
  880    def get_input_format(self, input_file: str = None) -> str:
  881        """
  882        This function returns the format of the input variable, either from the provided input file or
  883        by prompting for input.
  884
  885        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  886        represents the file path of the input file. If no `input_file` is provided when calling the
  887        method, it will default to `None`
  888        :type input_file: str
  889        :return: The format of the input variable is being returned.
  890        """
  891
  892        if not input_file:
  893            input_file = self.get_input()
  894        input_format = get_file_format(input_file)
  895        return input_format
  896
  897    def get_input_compressed(self, input_file: str = None) -> str:
  898        """
  899        The function `get_input_compressed` returns the format of the input variable after compressing
  900        it.
  901
  902        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  903        that represents the file path of the input file. If no `input_file` is provided when calling the
  904        method, it will default to `None` and the method will then call `self.get_input()` to
  905        :type input_file: str
  906        :return: The function `get_input_compressed` returns the compressed format of the input
  907        variable.
  908        """
  909
  910        if not input_file:
  911            input_file = self.get_input()
  912        input_compressed = get_file_compressed(input_file)
  913        return input_compressed
  914
  915    def get_output(self) -> str:
  916        """
  917        It returns the output of the neuron.
  918        :return: The output of the neural network.
  919        """
  920
  921        return self.output
  922
  923    def get_output_format(self, output_file: str = None) -> str:
  924        """
  925        The function `get_output_format` returns the format of the input variable or the output file if
  926        provided.
  927
  928        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  929        that represents the file path of the output file. If no `output_file` is provided when calling
  930        the method, it will default to the output obtained from the `get_output` method of the class
  931        instance. The
  932        :type output_file: str
  933        :return: The format of the input variable is being returned.
  934        """
  935
  936        if not output_file:
  937            output_file = self.get_output()
  938        output_format = get_file_format(output_file)
  939
  940        return output_format
  941
  942    def get_config(self) -> dict:
  943        """
  944        It returns the config
  945        :return: The config variable is being returned.
  946        """
  947        return self.config
  948
  949    def get_param(self) -> dict:
  950        """
  951        It returns the param
  952        :return: The param variable is being returned.
  953        """
  954        return self.param
  955
  956    def get_connexion_db(self) -> str:
  957        """
  958        It returns the connexion_db attribute of the object
  959        :return: The connexion_db is being returned.
  960        """
  961        return self.connexion_db
  962
  963    def get_prefix(self) -> str:
  964        """
  965        It returns the prefix of the object.
  966        :return: The prefix is being returned.
  967        """
  968        return self.prefix
  969
  970    def get_table_variants(self, clause: str = "select") -> str:
  971        """
  972        This function returns the table_variants attribute of the object
  973
  974        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
  975        defaults to select (optional)
  976        :return: The table_variants attribute of the object.
  977        """
  978
  979        # Access
  980        access = self.get_config().get("access", None)
  981
  982        # Clauses "select", "where", "update"
  983        if clause in ["select", "where", "update"]:
  984            table_variants = self.table_variants
  985        # Clause "from"
  986        elif clause in ["from"]:
  987            # For Read Only
  988            if self.get_input_format() in ["parquet"] and access in ["RO"]:
  989                input_file = self.get_input()
  990                table_variants = f"'{input_file}' as variants"
  991            # For Read Write
  992            else:
  993                table_variants = f"{self.table_variants} as variants"
  994        else:
  995            table_variants = self.table_variants
  996        return table_variants
  997
  998    def get_tmp_dir(self) -> str:
  999        """
 1000        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1001        parameters or a default path.
 1002        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1003        configuration, parameters, and a default value of "/tmp".
 1004        """
 1005
 1006        return get_tmp(
 1007            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1008        )
 1009
 1010    def get_connexion_type(self) -> str:
 1011        """
 1012        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1013
 1014        :return: The connexion type is being returned.
 1015        """
 1016        return self.get_config().get("connexion_type", "memory")
 1017
 1018    def get_connexion(self):
 1019        """
 1020        It returns the connection object
 1021
 1022        :return: The connection object.
 1023        """
 1024        return self.conn
 1025
 1026    def close_connexion(self) -> None:
 1027        """
 1028        This function closes the connection to the database.
 1029        :return: The connection is being closed.
 1030        """
 1031        return self.conn.close()
 1032
 1033    def get_header(self, type: str = "vcf"):
 1034        """
 1035        This function returns the header of the VCF file as a list of strings
 1036
 1037        :param type: the type of header you want to get, defaults to vcf (optional)
 1038        :return: The header of the vcf file.
 1039        """
 1040
 1041        if self.header_vcf:
 1042            if type == "vcf":
 1043                return self.header_vcf
 1044            elif type == "list":
 1045                return self.header_list
 1046        else:
 1047            if type == "vcf":
 1048                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1049                return header
 1050            elif type == "list":
 1051                return vcf_required
 1052
 1053    def get_header_length(self, file: str = None) -> int:
 1054        """
 1055        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1056        line.
 1057
 1058        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1059        header file. If this argument is provided, the function will read the header from the specified
 1060        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1061        :type file: str
 1062        :return: the length of the header list, excluding the #CHROM line.
 1063        """
 1064
 1065        if file:
 1066            return len(self.read_vcf_header_file(file=file)) - 1
 1067        elif self.get_header(type="list"):
 1068            return len(self.get_header(type="list")) - 1
 1069        else:
 1070            return 0
 1071
 1072    def get_header_columns(self) -> str:
 1073        """
 1074        This function returns the header list of a VCF
 1075
 1076        :return: The length of the header list.
 1077        """
 1078        if self.get_header():
 1079            return self.get_header(type="list")[-1]
 1080        else:
 1081            return ""
 1082
 1083    def get_header_columns_as_list(self) -> list:
 1084        """
 1085        This function returns the header list of a VCF
 1086
 1087        :return: The length of the header list.
 1088        """
 1089        if self.get_header():
 1090            return self.get_header_columns().strip().split("\t")
 1091        else:
 1092            return []
 1093
 1094    def get_header_columns_as_sql(self) -> str:
 1095        """
 1096        This function retruns header length (without #CHROM line)
 1097
 1098        :return: The length of the header list.
 1099        """
 1100        sql_column_list = []
 1101        for col in self.get_header_columns_as_list():
 1102            sql_column_list.append(f'"{col}"')
 1103        return ",".join(sql_column_list)
 1104
 1105    def get_header_sample_list(self) -> list:
 1106        """
 1107        This function retruns header length (without #CHROM line)
 1108
 1109        :return: The length of the header list.
 1110        """
 1111        return self.header_vcf.samples
 1112
 1113    def get_verbose(self) -> bool:
 1114        """
 1115        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1116        exist
 1117
 1118        :return: The value of the key "verbose" in the config dictionary.
 1119        """
 1120        return self.get_config().get("verbose", False)
 1121
 1122    def get_connexion_format(self) -> str:
 1123        """
 1124        It returns the connexion format of the object.
 1125        :return: The connexion_format is being returned.
 1126        """
 1127        connexion_format = self.connexion_format
 1128        if connexion_format not in ["duckdb", "sqlite"]:
 1129            log.error(f"Unknown connexion format {connexion_format}")
 1130            raise ValueError(f"Unknown connexion format {connexion_format}")
 1131        else:
 1132            return connexion_format
 1133
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the connected database.

        :param file: the file to load — a path or file-like object, anything
        accepted by pandas.read_csv
        :param columns: comma-separated list of (quoted) column names used in
        the INSERT statement on the duckdb path
        :type columns: str
        :param header_len: number of leading lines to skip before the data
        (e.g. the VCF meta-header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: field delimiter used in the file, defaults to a tab
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
        the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Configuration may override the chunk size ("load.chunk")
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize resolves falsy (e.g. 0 from config),
        # nothing is loaded at all — confirm this is intended.
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the bare name "chunk" in the SQL text via
                    # its replacement scan of local DataFrames — do NOT rename
                    # the "chunk" loop variable above.
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: append the chunk through pandas' to_sql
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1187
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads the input file (VCF/TSV/CSV/PSV or an
        existing database) and loads it into the variants table, with options to
        drop the table before loading the data and to specify a sample size.

        :param input_file: The path to the input file. When provided, it replaces
        the object's current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows sampled (by the Database helper) when
        inferring the schema; falsy values are normalized to -1, defaults to 20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file (and refresh the header from it)
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access mode ("RO" loads as a view instead of a table below)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format — computed for logging only; not used below
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite")
        connexion_format = self.get_connexion_format()

        # Sample size (falsy means unlimited, encoded as -1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                # connexion_format is always "duckdb" in this branch, so the
                # else arm below is effectively unreachable (kept as a guard)
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View through the Database helper
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read-only access creates a view; otherwise materialize a table
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                # NOTE(review): bare except also catches KeyboardInterrupt/
                # SystemExit and discards the original error — consider
                # `except Exception as e` and chaining with `from e`
                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: fixed VCF columns and their SQLite types
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — `structure` and
            # `structure_complete` are the same dict; harmless here since both
            # names receive the same additions, but confirm it is intentional
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database table for the variants
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the number of rows per chunk when loading the file
            chunksize = 100000

            # delimiter derived from the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                # NOTE(review): rebinding input_file to the bgzf handle means
                # the `with` still closes the original text handle, but the
                # bgzf handle itself is never explicitly closed — confirm
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        # Any other connexion/format combination is unsupported
        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields, when requested by params
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()
 1383
 1384    def get_explode_infos(self) -> bool:
 1385        """
 1386        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1387        to False if it is not set.
 1388        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1389        value. If the parameter is not present, it will return False.
 1390        """
 1391
 1392        return self.get_param().get("explode", {}).get("explode_infos", False)
 1393
 1394    def get_explode_infos_fields(
 1395        self,
 1396        explode_infos_fields: str = None,
 1397        remove_fields_not_in_header: bool = False,
 1398    ) -> list:
 1399        """
 1400        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1401        the input parameter `explode_infos_fields`.
 1402
 1403        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1404        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1405        comma-separated list of field names to explode
 1406        :type explode_infos_fields: str
 1407        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1408        flag that determines whether to remove fields that are not present in the header. If it is set
 1409        to `True`, any field that is not in the header will be excluded from the list of exploded
 1410        information fields. If it is set to `, defaults to False
 1411        :type remove_fields_not_in_header: bool (optional)
 1412        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1413        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1414        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1415        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1416        splitting the string by commas.
 1417        """
 1418
 1419        # If no fields, get it in param
 1420        if not explode_infos_fields:
 1421            explode_infos_fields = (
 1422                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1423            )
 1424
 1425        # If no fields, defined as all fields in header using keyword
 1426        if not explode_infos_fields:
 1427            explode_infos_fields = "*"
 1428
 1429        # If fields list not empty
 1430        if explode_infos_fields:
 1431
 1432            # Input fields list
 1433            if isinstance(explode_infos_fields, str):
 1434                fields_input = explode_infos_fields.split(",")
 1435            elif isinstance(explode_infos_fields, list):
 1436                fields_input = explode_infos_fields
 1437            else:
 1438                fields_input = []
 1439
 1440            # Fields list without * keyword
 1441            fields_without_all = fields_input.copy()
 1442            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1443                fields_without_all.remove("*")
 1444
 1445            # Fields in header
 1446            fields_in_header = sorted(list(set(self.get_header().infos)))
 1447
 1448            # Construct list of fields
 1449            fields_output = []
 1450            for field in fields_input:
 1451
 1452                # Strip field
 1453                field = field.strip()
 1454
 1455                # format keyword * in regex
 1456                if field.upper() in ["*"]:
 1457                    field = ".*"
 1458
 1459                # Find all fields with pattern
 1460                r = re.compile(field)
 1461                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1462
 1463                # Remove fields input from search
 1464                if field in fields_search:
 1465                    fields_search = [field]
 1466                elif fields_search != [field]:
 1467                    fields_search = sorted(
 1468                        list(set(fields_search).difference(fields_input))
 1469                    )
 1470
 1471                # If field is not in header (avoid not well formatted header)
 1472                if not fields_search and not remove_fields_not_in_header:
 1473                    fields_search = [field]
 1474
 1475                # Add found fields
 1476                for new_field in fields_search:
 1477                    # Add field, if not already exists, and if it is in header (if asked)
 1478                    if (
 1479                        new_field not in fields_output
 1480                        and (
 1481                            not remove_fields_not_in_header
 1482                            or new_field in fields_in_header
 1483                        )
 1484                        and new_field not in [".*"]
 1485                    ):
 1486                        fields_output.append(new_field)
 1487
 1488            return fields_output
 1489
 1490        else:
 1491
 1492            return []
 1493
 1494    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1495        """
 1496        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1497        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1498        not provided.
 1499
 1500        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1501        prefix to be used for exploding or expanding information
 1502        :type explode_infos_prefix: str
 1503        :return: the value of the variable `explode_infos_prefix`.
 1504        """
 1505
 1506        if not explode_infos_prefix:
 1507            explode_infos_prefix = (
 1508                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1509            )
 1510
 1511        return explode_infos_prefix
 1512
 1513    def add_column(
 1514        self,
 1515        table_name,
 1516        column_name,
 1517        column_type,
 1518        default_value=None,
 1519        drop: bool = False,
 1520    ) -> dict:
 1521        """
 1522        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1523        doesn't already exist.
 1524
 1525        :param table_name: The name of the table to which you want to add a column
 1526        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1527        to the table
 1528        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1529        want to add to the table. It should be a string that represents the desired data type, such as
 1530        "INTEGER", "TEXT", "REAL", etc
 1531        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1532        default value for the newly added column. If a default value is provided, it will be assigned to
 1533        the column for any existing rows that do not have a value for that column
 1534        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1535        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1536        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1537        to False
 1538        :type drop: bool (optional)
 1539        :return: a boolean value indicating whether the column was successfully added to the table.
 1540        """
 1541
 1542        # added
 1543        added = False
 1544        dropped = False
 1545
 1546        # Check if the column already exists in the table
 1547        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1548        columns = self.get_query_to_df(query).columns.tolist()
 1549        if column_name.upper() in [c.upper() for c in columns]:
 1550            log.debug(
 1551                f"The {column_name} column already exists in the {table_name} table"
 1552            )
 1553            if drop:
 1554                self.drop_column(table_name=table_name, column_name=column_name)
 1555                dropped = True
 1556            else:
 1557                return None
 1558        else:
 1559            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1560
 1561        # Add column in table
 1562        add_column_query = (
 1563            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1564        )
 1565        if default_value is not None:
 1566            add_column_query += f" DEFAULT {default_value}"
 1567        self.execute_query(add_column_query)
 1568        added = not dropped
 1569        log.debug(
 1570            f"The {column_name} column was successfully added to the {table_name} table"
 1571        )
 1572
 1573        if added:
 1574            added_column = {
 1575                "table_name": table_name,
 1576                "column_name": column_name,
 1577                "column_type": column_type,
 1578                "default_value": default_value,
 1579            }
 1580        else:
 1581            added_column = None
 1582
 1583        return added_column
 1584
 1585    def drop_column(
 1586        self, column: dict = None, table_name: str = None, column_name: str = None
 1587    ) -> bool:
 1588        """
 1589        The `drop_column` function drops a specified column from a given table in a database and returns
 1590        True if the column was successfully dropped, and False if the column does not exist in the
 1591        table.
 1592
 1593        :param column: The `column` parameter is a dictionary that contains information about the column
 1594        you want to drop. It has two keys:
 1595        :type column: dict
 1596        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1597        drop a column
 1598        :type table_name: str
 1599        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1600        from the table
 1601        :type column_name: str
 1602        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1603        and False if the column does not exist in the table.
 1604        """
 1605
 1606        # Find column infos
 1607        if column:
 1608            if isinstance(column, dict):
 1609                table_name = column.get("table_name", None)
 1610                column_name = column.get("column_name", None)
 1611            elif isinstance(column, str):
 1612                table_name = self.get_table_variants()
 1613                column_name = column
 1614            else:
 1615                table_name = None
 1616                column_name = None
 1617
 1618        if not table_name and not column_name:
 1619            return False
 1620
 1621        # Removed
 1622        removed = False
 1623
 1624        # Check if the column already exists in the table
 1625        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1626        columns = self.get_query_to_df(query).columns.tolist()
 1627        if column_name in columns:
 1628            log.debug(f"The {column_name} column exists in the {table_name} table")
 1629        else:
 1630            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1631            return False
 1632
 1633        # Add column in table # ALTER TABLE integers DROP k
 1634        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1635        self.execute_query(add_column_query)
 1636        removed = True
 1637        log.debug(
 1638            f"The {column_name} column was successfully dropped to the {table_name} table"
 1639        )
 1640
 1641        return removed
 1642
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
        a list to the `
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually. The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name. If the `table`
        parameter is
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # Drop indexes first: columns are added/updated below, and indexes
        # are (optionally) re-created at the end when create_index is True
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite") drives the SQL dialect used below
        connexion_format = self.get_connexion_format()

        # Access mode: "RO" (read-only) prevents any column creation/update
        access = self.get_config().get("access", None)

        # Added columns (return value): one dict per genuinely new column
        added_columns = []

        if access not in ["RO"]:

            # prefix for exploded columns; falls back to the configured
            # explode_infos_prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants: explicit table wins over the default variants table
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header)
            # NOTE(review): bare except silently treats any failure as "no extra infos"
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos: INFO field declarations from the VCF header
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            # One "SET <col> = <expr>" fragment per field to explode
            sql_info_alter_table_array = []

            # Info fields to check: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. wildcards) into concrete field names
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column in the table
                info_id_sql = prefix + info

                # Only explode known fields (header, prefixed header, or extra columns)
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Type/cardinality from the header; unknown fields default to String
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field (returns None when the column already existed and was not re-created)
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Re-fill the column when it is new, or when force re-created it
                    if added_column or force:

                        # add field to index (used later by create_indexes)
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the
                        # raw INFO column ('' and '.' are normalized to NULL)
                        if connexion_format in ["duckdb"]:
                            # DuckDB: regex extraction on ';'-prefixed INFO string
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite: no regex, so nested instr/substr extraction
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        # NOTE(review): if connexion_format is neither "duckdb" nor
                        # "sqlite", update_info_field is unbound (first iteration) or
                        # stale from a previous iteration — confirm supported formats
                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes: split the UPDATEs per chromosome to keep them smaller
                # NOTE(review): bare except falls back to a single unfiltered UPDATE
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful when more than one chromosome exists)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either all fields in one UPDATE, or one UPDATE per field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (including the newly registered additional fields)
        if create_index:
            self.create_indexes()

        return added_columns
 1859
 1860    def create_indexes(self) -> None:
 1861        """
 1862        Create indexes on the table after insertion
 1863        """
 1864
 1865        # Access
 1866        access = self.get_config().get("access", None)
 1867
 1868        # get table variants
 1869        table_variants = self.get_table_variants("FROM")
 1870
 1871        if self.get_indexing() and access not in ["RO"]:
 1872            # Create index
 1873            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1874            self.conn.execute(sql_create_table_index)
 1875            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1876            self.conn.execute(sql_create_table_index)
 1877            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1878            self.conn.execute(sql_create_table_index)
 1879            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1880            self.conn.execute(sql_create_table_index)
 1881            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1882            self.conn.execute(sql_create_table_index)
 1883            for field in self.index_additionnal_fields:
 1884                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1885                self.conn.execute(sql_create_table_index)
 1886
 1887    def drop_indexes(self) -> None:
 1888        """
 1889        Create indexes on the table after insertion
 1890        """
 1891
 1892        # Access
 1893        access = self.get_config().get("access", None)
 1894
 1895        # get table variants
 1896        table_variants = self.get_table_variants("FROM")
 1897
 1898        # Get database format
 1899        connexion_format = self.get_connexion_format()
 1900
 1901        if access not in ["RO"]:
 1902            if connexion_format in ["duckdb"]:
 1903                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 1904            elif connexion_format in ["sqlite"]:
 1905                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 1906
 1907            list_indexes = self.conn.execute(sql_list_indexes)
 1908            index_names = [row[0] for row in list_indexes.fetchall()]
 1909            for index in index_names:
 1910                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 1911                self.conn.execute(sql_drop_table_index)
 1912
 1913    def read_vcf_header(self, f) -> list:
 1914        """
 1915        It reads the header of a VCF file and returns a list of the header lines
 1916
 1917        :param f: the file object
 1918        :return: The header lines of the VCF file.
 1919        """
 1920
 1921        header_list = []
 1922        for line in f:
 1923            header_list.append(line)
 1924            if line.startswith("#CHROM"):
 1925                break
 1926        return header_list
 1927
 1928    def read_vcf_header_file(self, file: str = None) -> list:
 1929        """
 1930        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 1931        uncompressed files.
 1932
 1933        :param file: The `file` parameter is a string that represents the path to the VCF header file
 1934        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 1935        default to `None`
 1936        :type file: str
 1937        :return: The function `read_vcf_header_file` returns a list.
 1938        """
 1939
 1940        if self.get_input_compressed(input_file=file):
 1941            with bgzf.open(file, "rt") as f:
 1942                return self.read_vcf_header(f=f)
 1943        else:
 1944            with open(file, "rt") as f:
 1945                return self.read_vcf_header(f=f)
 1946
 1947    def execute_query(self, query: str):
 1948        """
 1949        It takes a query as an argument, executes it, and returns the results
 1950
 1951        :param query: The query to be executed
 1952        :return: The result of the query is being returned.
 1953        """
 1954        if query:
 1955            return self.conn.execute(query)  # .fetchall()
 1956        else:
 1957            return None
 1958
 1959    def export_output(
 1960        self,
 1961        output_file: str | None = None,
 1962        output_header: str | None = None,
 1963        export_header: bool = True,
 1964        query: str | None = None,
 1965        parquet_partitions: list | None = None,
 1966        chunk_size: int | None = None,
 1967        threads: int | None = None,
 1968        sort: bool = False,
 1969        index: bool = False,
 1970        order_by: str | None = None,
 1971    ) -> bool:
 1972        """
 1973        The `export_output` function exports data from a VCF file to a specified output file in various
 1974        formats, including VCF, CSV, TSV, PSV, and Parquet.
 1975
 1976        :param output_file: The `output_file` parameter is a string that specifies the name of the
 1977        output file to be generated by the function. This is where the exported data will be saved
 1978        :type output_file: str
 1979        :param output_header: The `output_header` parameter is a string that specifies the name of the
 1980        file where the header of the VCF file will be exported. If this parameter is not provided, the
 1981        header will be exported to a file with the same name as the `output_file` parameter, but with
 1982        the extension "
 1983        :type output_header: str
 1984        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 1985        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 1986        True, the header will be exported to a file. If `export_header` is False, the header will not
 1987        be, defaults to True, if output format is not VCF
 1988        :type export_header: bool (optional)
 1989        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 1990        select specific data from the VCF file before exporting it. If provided, only the data that
 1991        matches the query will be exported
 1992        :type query: str
 1993        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 1994        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 1995        organize data in a hierarchical directory structure based on the values of one or more columns.
 1996        This can improve query performance when working with large datasets
 1997        :type parquet_partitions: list
 1998        :param chunk_size: The `chunk_size` parameter specifies the number of
 1999        records in batch when exporting data in Parquet format. This parameter is used for
 2000        partitioning the Parquet file into multiple files.
 2001        :type chunk_size: int
 2002        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2003        threads to be used during the export process. It determines the level of parallelism and can
 2004        improve the performance of the export operation. If not provided, the function will use the
 2005        default number of threads
 2006        :type threads: int
 2007        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2008        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2009        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2010        False
 2011        :type sort: bool (optional)
 2012        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2013        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2014        no index will be created. The default value is False, defaults to False
 2015        :type index: bool (optional)
 2016        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2017        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2018        :type order_by: str
 2019        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2020        None if it doesn't.
 2021        """
 2022
 2023        # Log
 2024        log.info("Exporting...")
 2025
 2026        # Full path
 2027        output_file = full_path(output_file)
 2028        output_header = full_path(output_header)
 2029
 2030        # Config
 2031        config = self.get_config()
 2032
 2033        # Param
 2034        param = self.get_param()
 2035
 2036        # Tmp files to remove
 2037        tmp_to_remove = []
 2038
 2039        # If no output, get it
 2040        if not output_file:
 2041            output_file = self.get_output()
 2042
 2043        # If not threads
 2044        if not threads:
 2045            threads = self.get_threads()
 2046
 2047        # Auto header name with extension
 2048        if export_header or output_header:
 2049            if not output_header:
 2050                output_header = f"{output_file}.hdr"
 2051            # Export header
 2052            self.export_header(output_file=output_file)
 2053
 2054        # Switch off export header if VCF output
 2055        output_file_type = get_file_format(output_file)
 2056        if output_file_type in ["vcf"]:
 2057            export_header = False
 2058            tmp_to_remove.append(output_header)
 2059
 2060        # Chunk size
 2061        if not chunk_size:
 2062            chunk_size = config.get("chunk_size", None)
 2063
 2064        # Parquet partition
 2065        if not parquet_partitions:
 2066            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2067        if parquet_partitions and isinstance(parquet_partitions, str):
 2068            parquet_partitions = parquet_partitions.split(",")
 2069
 2070        # Order by
 2071        if not order_by:
 2072            order_by = param.get("export", {}).get("order_by", "")
 2073
 2074        # Header in output
 2075        header_in_output = param.get("export", {}).get("include_header", False)
 2076
 2077        # Database
 2078        database_source = self.get_connexion()
 2079
 2080        # Connexion format
 2081        connexion_format = self.get_connexion_format()
 2082
 2083        # Explode infos
 2084        if self.get_explode_infos():
 2085            self.explode_infos(
 2086                prefix=self.get_explode_infos_prefix(),
 2087                fields=self.get_explode_infos_fields(),
 2088                force=False,
 2089            )
 2090
 2091        # if connexion_format in ["sqlite"] or query:
 2092        if connexion_format in ["sqlite"]:
 2093
 2094            # Export in Parquet
 2095            random_tmp = "".join(
 2096                random.choice(string.ascii_lowercase) for i in range(10)
 2097            )
 2098            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2099            tmp_to_remove.append(database_source)
 2100
 2101            # Table Variants
 2102            table_variants = self.get_table_variants()
 2103
 2104            # Create export query
 2105            sql_query_export_subquery = f"""
 2106                SELECT * FROM {table_variants}
 2107                """
 2108
 2109            # Write source file
 2110            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2111
 2112        # Create database
 2113        database = Database(
 2114            database=database_source,
 2115            table="variants",
 2116            header_file=output_header,
 2117            conn_config=self.get_connexion_config(),
 2118        )
 2119
 2120        # Existing colomns header
 2121        # existing_columns_header = database.get_header_file_columns(output_header)
 2122        existing_columns_header = database.get_header_columns_from_database()
 2123
 2124        # Export file
 2125        database.export(
 2126            output_database=output_file,
 2127            output_header=output_header,
 2128            existing_columns_header=existing_columns_header,
 2129            parquet_partitions=parquet_partitions,
 2130            chunk_size=chunk_size,
 2131            threads=threads,
 2132            sort=sort,
 2133            index=index,
 2134            header_in_output=header_in_output,
 2135            order_by=order_by,
 2136            query=query,
 2137            export_header=export_header,
 2138        )
 2139
 2140        # Remove
 2141        remove_if_exists(tmp_to_remove)
 2142
 2143        return (os.path.exists(output_file) or None) and (
 2144            os.path.exists(output_file) or None
 2145        )
 2146
 2147    def get_extra_infos(self, table: str = None) -> list:
 2148        """
 2149        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2150        in the header.
 2151
 2152        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2153        name of the table from which you want to retrieve the extra columns that are not present in the
 2154        header. If the `table` parameter is not provided when calling the function, it will default to
 2155        using the variants
 2156        :type table: str
 2157        :return: A list of columns that are in the specified table but not in the header of the table.
 2158        """
 2159
 2160        header_columns = []
 2161
 2162        if not table:
 2163            table = self.get_table_variants(clause="from")
 2164            header_columns = self.get_header_columns()
 2165
 2166        # Check all columns in the database
 2167        query = f""" SELECT * FROM {table} LIMIT 1 """
 2168        log.debug(f"query {query}")
 2169        table_columns = self.get_query_to_df(query).columns.tolist()
 2170        extra_columns = []
 2171
 2172        # Construct extra infos (not in header)
 2173        for column in table_columns:
 2174            if column not in header_columns:
 2175                extra_columns.append(column)
 2176
 2177        return extra_columns
 2178
 2179    def get_extra_infos_sql(self, table: str = None) -> str:
 2180        """
 2181        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2182        by double quotes
 2183
 2184        :param table: The name of the table to get the extra infos from. If None, the default table is
 2185        used
 2186        :type table: str
 2187        :return: A string of the extra infos
 2188        """
 2189
 2190        return ", ".join(
 2191            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2192        )
 2193
 2194    def export_header(
 2195        self,
 2196        header_name: str = None,
 2197        output_file: str = None,
 2198        output_file_ext: str = ".hdr",
 2199        clean_header: bool = True,
 2200        remove_chrom_line: bool = False,
 2201    ) -> str:
 2202        """
 2203        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2204        specified options, and writes it to a new file.
 2205
 2206        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2207        this parameter is not specified, the header will be written to the output file
 2208        :type header_name: str
 2209        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2210        specify the name of the output file where the header will be written. If this parameter is not
 2211        provided, the header will be written to a temporary file
 2212        :type output_file: str
 2213        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2214        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2215        if not specified by the user. This extension will be appended to the `output_file` name to
 2216        create the final, defaults to .hdr
 2217        :type output_file_ext: str (optional)
 2218        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2219        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2220        `True`, the function will clean the header by modifying certain lines based on a specific
 2221        pattern. If `clean_header`, defaults to True
 2222        :type clean_header: bool (optional)
 2223        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2224        boolean flag that determines whether the #CHROM line should be removed from the header before
 2225        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2226        defaults to False
 2227        :type remove_chrom_line: bool (optional)
 2228        :return: The function `export_header` returns the name of the temporary header file that is
 2229        created.
 2230        """
 2231
 2232        if not header_name and not output_file:
 2233            output_file = self.get_output()
 2234
 2235        if self.get_header():
 2236
 2237            # Get header object
 2238            header_obj = self.get_header()
 2239
 2240            # Create database
 2241            db_for_header = Database(database=self.get_input())
 2242
 2243            # Get real columns in the file
 2244            db_header_columns = db_for_header.get_columns()
 2245
 2246            with tempfile.TemporaryDirectory() as tmpdir:
 2247
 2248                # Write header file
 2249                header_file_tmp = os.path.join(tmpdir, "header")
 2250                f = open(header_file_tmp, "w")
 2251                vcf.Writer(f, header_obj)
 2252                f.close()
 2253
 2254                # Replace #CHROM line with rel columns
 2255                header_list = db_for_header.read_header_file(
 2256                    header_file=header_file_tmp
 2257                )
 2258                header_list[-1] = "\t".join(db_header_columns)
 2259
 2260                # Remove CHROM line
 2261                if remove_chrom_line:
 2262                    header_list.pop()
 2263
 2264                # Clean header
 2265                if clean_header:
 2266                    header_list_clean = []
 2267                    for head in header_list:
 2268                        # Clean head for malformed header
 2269                        head_clean = head
 2270                        head_clean = re.subn(
 2271                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2272                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2273                            head_clean,
 2274                            2,
 2275                        )[0]
 2276                        # Write header
 2277                        header_list_clean.append(head_clean)
 2278                    header_list = header_list_clean
 2279
 2280            tmp_header_name = output_file + output_file_ext
 2281
 2282            f = open(tmp_header_name, "w")
 2283            for line in header_list:
 2284                f.write(line)
 2285            f.close()
 2286
 2287        return tmp_header_name
 2288
 2289    def export_variant_vcf(
 2290        self,
 2291        vcf_file,
 2292        remove_info: bool = False,
 2293        add_samples: bool = True,
 2294        list_samples: list = [],
 2295        where_clause: str = "",
 2296        index: bool = False,
 2297        threads: int | None = None,
 2298    ) -> bool | None:
 2299        """
 2300        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2301        remove INFO field, add samples, and control compression and indexing.
 2302
 2303        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2304        written to. It is the output file that will contain the filtered VCF data based on the specified
 2305        parameters
 2306        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2307        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2308        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2309        in, defaults to False
 2310        :type remove_info: bool (optional)
 2311        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2312        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2313        If set to False, the samples will be removed. The default value is True, defaults to True
 2314        :type add_samples: bool (optional)
 2315        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2316        in the output VCF file. By default, all samples will be included. If you provide a list of
 2317        samples, only those samples will be included in the output file
 2318        :type list_samples: list
 2319        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2320        determines whether or not to create an index for the output VCF file. If `index` is set to
 2321        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2322        :type index: bool (optional)
 2323        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2324        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2325        will be used during the export process. More threads can potentially speed up the export process
 2326        by utilizing multiple cores of the processor. If
 2327        :type threads: int | None
 2328        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2329        method with various parameters including the output file, query, threads, sort flag, and index
 2330        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2331        specified parameters and configurations provided in the `export_variant_vcf` function.
 2332        """
 2333
 2334        # Config
 2335        config = self.get_config()
 2336
 2337        # Extract VCF
 2338        log.debug("Export VCF...")
 2339
 2340        # Table variants
 2341        table_variants = self.get_table_variants()
 2342
 2343        # Threads
 2344        if not threads:
 2345            threads = self.get_threads()
 2346
 2347        # Info fields
 2348        if remove_info:
 2349            if not isinstance(remove_info, str):
 2350                remove_info = "."
 2351            info_field = f"""'{remove_info}' as INFO"""
 2352        else:
 2353            info_field = "INFO"
 2354
 2355        # Samples fields
 2356        if add_samples:
 2357            if not list_samples:
 2358                list_samples = self.get_header_sample_list()
 2359            if list_samples:
 2360                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2361            else:
 2362                samples_fields = ""
 2363            log.debug(f"samples_fields: {samples_fields}")
 2364        else:
 2365            samples_fields = ""
 2366
 2367        # Where clause
 2368        if where_clause is None:
 2369            where_clause = ""
 2370
 2371        # Variants
 2372        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2373        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2374        log.debug(f"sql_query_select={sql_query_select}")
 2375
 2376        return self.export_output(
 2377            output_file=vcf_file,
 2378            output_header=None,
 2379            export_header=True,
 2380            query=sql_query_select,
 2381            parquet_partitions=None,
 2382            chunk_size=config.get("chunk_size", None),
 2383            threads=threads,
 2384            sort=True,
 2385            index=index,
 2386            order_by=None,
 2387        )
 2388
 2389    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2390        """
 2391        It takes a list of commands and runs them in parallel using the number of threads specified
 2392
 2393        :param commands: A list of commands to run
 2394        :param threads: The number of threads to use, defaults to 1 (optional)
 2395        """
 2396
 2397        run_parallel_commands(commands, threads)
 2398
 2399    def get_threads(self, default: int = 1) -> int:
 2400        """
 2401        This function returns the number of threads to use for a job, with a default value of 1 if not
 2402        specified.
 2403
 2404        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2405        default number of threads to use if no specific value is provided. If no value is provided for
 2406        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2407        used, defaults to 1
 2408        :type default: int (optional)
 2409        :return: the number of threads to use for the current job.
 2410        """
 2411
 2412        # Config
 2413        config = self.get_config()
 2414
 2415        # Param
 2416        param = self.get_param()
 2417
 2418        # Input threads
 2419        input_thread = param.get("threads", config.get("threads", None))
 2420
 2421        # Check threads
 2422        if not input_thread:
 2423            threads = default
 2424        elif int(input_thread) <= 0:
 2425            threads = os.cpu_count()
 2426        else:
 2427            threads = int(input_thread)
 2428        return threads
 2429
 2430    def get_memory(self, default: str = None) -> str:
 2431        """
 2432        This function retrieves the memory value from parameters or configuration with a default value
 2433        if not found.
 2434
 2435        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2436        default value is used as a fallback in case the `memory` parameter is not provided in the
 2437        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2438        the function
 2439        :type default: str
 2440        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2441        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2442        return the default value provided as an argument to the function.
 2443        """
 2444
 2445        # Config
 2446        config = self.get_config()
 2447
 2448        # Param
 2449        param = self.get_param()
 2450
 2451        # Input threads
 2452        input_memory = param.get("memory", config.get("memory", None))
 2453
 2454        # Check threads
 2455        if input_memory:
 2456            memory = input_memory
 2457        else:
 2458            memory = default
 2459
 2460        return memory
 2461
 2462    def update_from_vcf(self, vcf_file: str) -> None:
 2463        """
 2464        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2465
 2466        :param vcf_file: the path to the VCF file
 2467        """
 2468
 2469        connexion_format = self.get_connexion_format()
 2470
 2471        if connexion_format in ["duckdb"]:
 2472            self.update_from_vcf_duckdb(vcf_file)
 2473        elif connexion_format in ["sqlite"]:
 2474            self.update_from_vcf_sqlite(vcf_file)
 2475
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        It takes a VCF file and updates the INFO column of the variants table in the database with the
        INFO column of the VCF file

        Variants are matched on #CHROM/POS/REF/ALT. Existing non-empty INFO
        values ('' and '.' count as empty) are kept and the matching VCF INFO
        is appended, separated by ';'.

        :param vcf_file: the path to the VCF file
        """

        # variants table name
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame: skip the '##' header lines so
        # that the '#CHROM ...' line becomes the column header row
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: 'vcf_df' is referenced by name inside the SQL below — duckdb
        # resolves it from the local Python scope (replacement scan), so the
        # local variable must stay alive until the query executes
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2531
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
        table, then updates the INFO column of the variants table with the INFO column of the temporary
        table

        Variants are matched on #CHROM/POS/REF/ALT; non-empty existing INFO
        values ('' and '.' count as empty) are kept and the VCF INFO is
        appended with a ';' separator.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body into the temporary table
        # NOTE(review): assumes the VCF body has exactly 8 columns (no
        # FORMAT/sample columns) — confirm against callers
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: SQLite concatenation uses the || operator (no concat())
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2589
 2590    def drop_variants_table(self) -> None:
 2591        """
 2592        > This function drops the variants table
 2593        """
 2594
 2595        table_variants = self.get_table_variants()
 2596        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2597        self.conn.execute(sql_table_variants)
 2598
 2599    def set_variant_id(
 2600        self, variant_id_column: str = "variant_id", force: bool = None
 2601    ) -> str:
 2602        """
 2603        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2604        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2605
 2606        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2607        to variant_id
 2608        :type variant_id_column: str (optional)
 2609        :param force: If True, the variant_id column will be created even if it already exists
 2610        :type force: bool
 2611        :return: The name of the column that contains the variant_id
 2612        """
 2613
 2614        # Assembly
 2615        assembly = self.get_param().get(
 2616            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2617        )
 2618
 2619        # INFO/Tag prefix
 2620        prefix = self.get_explode_infos_prefix()
 2621
 2622        # Explode INFO/SVTYPE
 2623        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2624
 2625        # variants table
 2626        table_variants = self.get_table_variants()
 2627
 2628        # variant_id column
 2629        if not variant_id_column:
 2630            variant_id_column = "variant_id"
 2631
 2632        # Creta variant_id column
 2633        if "variant_id" not in self.get_extra_infos() or force:
 2634
 2635            # Create column
 2636            self.add_column(
 2637                table_name=table_variants,
 2638                column_name=variant_id_column,
 2639                column_type="UBIGINT",
 2640                default_value="0",
 2641            )
 2642
 2643            # Update column
 2644            self.conn.execute(
 2645                f"""
 2646                    UPDATE {table_variants}
 2647                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2648                """
 2649            )
 2650
 2651        # Remove added columns
 2652        for added_column in added_columns:
 2653            self.drop_column(column=added_column)
 2654
 2655        # return variant_id column name
 2656        return variant_id_column
 2657
 2658    def get_variant_id_column(
 2659        self, variant_id_column: str = "variant_id", force: bool = None
 2660    ) -> str:
 2661        """
 2662        This function returns the variant_id column name
 2663
 2664        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2665        defaults to variant_id
 2666        :type variant_id_column: str (optional)
 2667        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2668        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2669        if it is not already set, or if it is set
 2670        :type force: bool
 2671        :return: The variant_id column name.
 2672        """
 2673
 2674        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2675
 2676    ###
 2677    # Annotation
 2678    ###
 2679
 2680    def scan_databases(
 2681        self,
 2682        database_formats: list = ["parquet"],
 2683        database_releases: list = ["current"],
 2684    ) -> dict:
 2685        """
 2686        The function `scan_databases` scans for available databases based on specified formats and
 2687        releases.
 2688
 2689        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2690        of the databases to be scanned. In this case, the accepted format is "parquet"
 2691        :type database_formats: list ["parquet"]
 2692        :param database_releases: The `database_releases` parameter is a list that specifies the
 2693        releases of the databases to be scanned. In the provided function, the default value for
 2694        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2695        databases that are in the "current"
 2696        :type database_releases: list
 2697        :return: The function `scan_databases` returns a dictionary containing information about
 2698        databases that match the specified formats and releases.
 2699        """
 2700
 2701        # Config
 2702        config = self.get_config()
 2703
 2704        # Param
 2705        param = self.get_param()
 2706
 2707        # Param - Assembly
 2708        assembly = param.get("assembly", config.get("assembly", None))
 2709        if not assembly:
 2710            assembly = DEFAULT_ASSEMBLY
 2711            log.warning(f"Default assembly '{assembly}'")
 2712
 2713        # Scan for availabled databases
 2714        log.info(
 2715            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2716        )
 2717        databases_infos_dict = databases_infos(
 2718            database_folder_releases=database_releases,
 2719            database_formats=database_formats,
 2720            assembly=assembly,
 2721            config=config,
 2722        )
 2723        log.info(
 2724            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2725        )
 2726
 2727        return databases_infos_dict
 2728
 2729    def annotation(self) -> None:
 2730        """
 2731        It annotates the VCF file with the annotations specified in the config file.
 2732        """
 2733
 2734        # Config
 2735        config = self.get_config()
 2736
 2737        # Param
 2738        param = self.get_param()
 2739
 2740        # Param - Assembly
 2741        assembly = param.get("assembly", config.get("assembly", None))
 2742        if not assembly:
 2743            assembly = DEFAULT_ASSEMBLY
 2744            log.warning(f"Default assembly '{assembly}'")
 2745
 2746        # annotations databases folders
 2747        annotations_databases = set(
 2748            config.get("folders", {})
 2749            .get("databases", {})
 2750            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2751            + config.get("folders", {})
 2752            .get("databases", {})
 2753            .get("parquet", ["~/howard/databases/parquet/current"])
 2754            + config.get("folders", {})
 2755            .get("databases", {})
 2756            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2757        )
 2758
 2759        # Get param annotations
 2760        if param.get("annotations", None) and isinstance(
 2761            param.get("annotations", None), str
 2762        ):
 2763            log.debug(param.get("annotations", None))
 2764            param_annotation_list = param.get("annotations").split(",")
 2765        else:
 2766            param_annotation_list = []
 2767
 2768        # Each tools param
 2769        if param.get("annotation_parquet", None) != None:
 2770            log.debug(
 2771                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2772            )
 2773            if isinstance(param.get("annotation_parquet", None), list):
 2774                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2775            else:
 2776                param_annotation_list.append(param.get("annotation_parquet"))
 2777        if param.get("annotation_snpsift", None) != None:
 2778            if isinstance(param.get("annotation_snpsift", None), list):
 2779                param_annotation_list.append(
 2780                    "snpsift:"
 2781                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2782                )
 2783            else:
 2784                param_annotation_list.append(
 2785                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2786                )
 2787        if param.get("annotation_snpeff", None) != None:
 2788            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2789        if param.get("annotation_bcftools", None) != None:
 2790            if isinstance(param.get("annotation_bcftools", None), list):
 2791                param_annotation_list.append(
 2792                    "bcftools:"
 2793                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2794                )
 2795            else:
 2796                param_annotation_list.append(
 2797                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2798                )
 2799        if param.get("annotation_annovar", None) != None:
 2800            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2801        if param.get("annotation_exomiser", None) != None:
 2802            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2803        if param.get("annotation_splice", None) != None:
 2804            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2805
 2806        # Merge param annotations list
 2807        param["annotations"] = ",".join(param_annotation_list)
 2808
 2809        # debug
 2810        log.debug(f"param_annotations={param['annotations']}")
 2811
 2812        if param.get("annotations"):
 2813
 2814            # Log
 2815            # log.info("Annotations - Check annotation parameters")
 2816
 2817            if not "annotation" in param:
 2818                param["annotation"] = {}
 2819
 2820            # List of annotations parameters
 2821            annotations_list_input = {}
 2822            if isinstance(param.get("annotations", None), str):
 2823                annotation_file_list = [
 2824                    value for value in param.get("annotations", "").split(",")
 2825                ]
 2826                for annotation_file in annotation_file_list:
 2827                    annotations_list_input[annotation_file] = {"INFO": None}
 2828            else:
 2829                annotations_list_input = param.get("annotations", {})
 2830
 2831            log.info(f"Quick Annotations:")
 2832            for annotation_key in list(annotations_list_input.keys()):
 2833                log.info(f"   {annotation_key}")
 2834
 2835            # List of annotations and associated fields
 2836            annotations_list = {}
 2837
 2838            for annotation_file in annotations_list_input:
 2839
 2840                # Explode annotations if ALL
 2841                if (
 2842                    annotation_file.upper() == "ALL"
 2843                    or annotation_file.upper().startswith("ALL:")
 2844                ):
 2845
 2846                    # check ALL parameters (formats, releases)
 2847                    annotation_file_split = annotation_file.split(":")
 2848                    database_formats = "parquet"
 2849                    database_releases = "current"
 2850                    for annotation_file_option in annotation_file_split[1:]:
 2851                        database_all_options_split = annotation_file_option.split("=")
 2852                        if database_all_options_split[0] == "format":
 2853                            database_formats = database_all_options_split[1].split("+")
 2854                        if database_all_options_split[0] == "release":
 2855                            database_releases = database_all_options_split[1].split("+")
 2856
 2857                    # Scan for availabled databases
 2858                    databases_infos_dict = self.scan_databases(
 2859                        database_formats=database_formats,
 2860                        database_releases=database_releases,
 2861                    )
 2862
 2863                    # Add found databases in annotation parameters
 2864                    for database_infos in databases_infos_dict.keys():
 2865                        annotations_list[database_infos] = {"INFO": None}
 2866
 2867                else:
 2868                    annotations_list[annotation_file] = annotations_list_input[
 2869                        annotation_file
 2870                    ]
 2871
 2872            # Check each databases
 2873            if len(annotations_list):
 2874
 2875                log.info(
 2876                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 2877                )
 2878
 2879                for annotation_file in annotations_list:
 2880
 2881                    # Init
 2882                    annotations = annotations_list.get(annotation_file, None)
 2883
 2884                    # Annotation snpEff
 2885                    if annotation_file.startswith("snpeff"):
 2886
 2887                        log.debug(f"Quick Annotation snpEff")
 2888
 2889                        if "snpeff" not in param["annotation"]:
 2890                            param["annotation"]["snpeff"] = {}
 2891
 2892                        if "options" not in param["annotation"]["snpeff"]:
 2893                            param["annotation"]["snpeff"]["options"] = ""
 2894
 2895                        # snpEff options in annotations
 2896                        param["annotation"]["snpeff"]["options"] = "".join(
 2897                            annotation_file.split(":")[1:]
 2898                        )
 2899
 2900                    # Annotation Annovar
 2901                    elif annotation_file.startswith("annovar"):
 2902
 2903                        log.debug(f"Quick Annotation Annovar")
 2904
 2905                        if "annovar" not in param["annotation"]:
 2906                            param["annotation"]["annovar"] = {}
 2907
 2908                        if "annotations" not in param["annotation"]["annovar"]:
 2909                            param["annotation"]["annovar"]["annotations"] = {}
 2910
 2911                        # Options
 2912                        annotation_file_split = annotation_file.split(":")
 2913                        for annotation_file_annotation in annotation_file_split[1:]:
 2914                            if annotation_file_annotation:
 2915                                param["annotation"]["annovar"]["annotations"][
 2916                                    annotation_file_annotation
 2917                                ] = annotations
 2918
 2919                    # Annotation Exomiser
 2920                    elif annotation_file.startswith("exomiser"):
 2921
 2922                        log.debug(f"Quick Annotation Exomiser")
 2923
 2924                        param["annotation"]["exomiser"] = params_string_to_dict(
 2925                            annotation_file
 2926                        )
 2927
 2928                    # Annotation Splice
 2929                    elif annotation_file.startswith("splice"):
 2930
 2931                        log.debug(f"Quick Annotation Splice")
 2932
 2933                        param["annotation"]["splice"] = params_string_to_dict(
 2934                            annotation_file
 2935                        )
 2936
 2937                    # Annotation Parquet or BCFTOOLS
 2938                    else:
 2939
 2940                        # Tools detection
 2941                        if annotation_file.startswith("bcftools:"):
 2942                            annotation_tool_initial = "bcftools"
 2943                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2944                        elif annotation_file.startswith("snpsift:"):
 2945                            annotation_tool_initial = "snpsift"
 2946                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2947                        else:
 2948                            annotation_tool_initial = None
 2949
 2950                        # list of files
 2951                        annotation_file_list = annotation_file.replace("+", ":").split(
 2952                            ":"
 2953                        )
 2954
 2955                        for annotation_file in annotation_file_list:
 2956
 2957                            if annotation_file:
 2958
 2959                                # Annotation tool initial
 2960                                annotation_tool = annotation_tool_initial
 2961
 2962                                # Find file
 2963                                annotation_file_found = None
 2964
 2965                                # Expand user
 2966                                annotation_file = full_path(annotation_file)
 2967
 2968                                if os.path.exists(annotation_file):
 2969                                    annotation_file_found = annotation_file
 2970
 2971                                else:
 2972                                    # Find within assembly folders
 2973                                    for annotations_database in annotations_databases:
 2974                                        found_files = find_all(
 2975                                            annotation_file,
 2976                                            os.path.join(
 2977                                                annotations_database, assembly
 2978                                            ),
 2979                                        )
 2980                                        if len(found_files) > 0:
 2981                                            annotation_file_found = found_files[0]
 2982                                            break
 2983                                    if not annotation_file_found and not assembly:
 2984                                        # Find within folders
 2985                                        for (
 2986                                            annotations_database
 2987                                        ) in annotations_databases:
 2988                                            found_files = find_all(
 2989                                                annotation_file, annotations_database
 2990                                            )
 2991                                            if len(found_files) > 0:
 2992                                                annotation_file_found = found_files[0]
 2993                                                break
 2994                                log.debug(
 2995                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 2996                                )
 2997
 2998                                # Full path
 2999                                annotation_file_found = full_path(annotation_file_found)
 3000
 3001                                if annotation_file_found:
 3002
 3003                                    database = Database(database=annotation_file_found)
 3004                                    quick_annotation_format = database.get_format()
 3005                                    quick_annotation_is_compressed = (
 3006                                        database.is_compressed()
 3007                                    )
 3008                                    quick_annotation_is_indexed = os.path.exists(
 3009                                        f"{annotation_file_found}.tbi"
 3010                                    )
 3011                                    bcftools_preference = False
 3012
 3013                                    # Check Annotation Tool
 3014                                    if not annotation_tool:
 3015                                        if (
 3016                                            bcftools_preference
 3017                                            and quick_annotation_format
 3018                                            in ["vcf", "bed"]
 3019                                            and quick_annotation_is_compressed
 3020                                            and quick_annotation_is_indexed
 3021                                        ):
 3022                                            annotation_tool = "bcftools"
 3023                                        elif quick_annotation_format in [
 3024                                            "vcf",
 3025                                            "bed",
 3026                                            "tsv",
 3027                                            "tsv",
 3028                                            "csv",
 3029                                            "json",
 3030                                            "tbl",
 3031                                            "parquet",
 3032                                            "duckdb",
 3033                                        ]:
 3034                                            annotation_tool = "parquet"
 3035                                        else:
 3036                                            log.error(
 3037                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3038                                            )
 3039                                            raise ValueError(
 3040                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3041                                            )
 3042
 3043                                    log.debug(
 3044                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3045                                    )
 3046
 3047                                    # Annotation Tool dispatch
 3048                                    if annotation_tool:
 3049                                        if annotation_tool not in param["annotation"]:
 3050                                            param["annotation"][annotation_tool] = {}
 3051                                        if (
 3052                                            "annotations"
 3053                                            not in param["annotation"][annotation_tool]
 3054                                        ):
 3055                                            param["annotation"][annotation_tool][
 3056                                                "annotations"
 3057                                            ] = {}
 3058                                        param["annotation"][annotation_tool][
 3059                                            "annotations"
 3060                                        ][annotation_file_found] = annotations
 3061
 3062                                else:
 3063                                    log.error(
 3064                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3065                                    )
 3066
 3067                self.set_param(param)
 3068
 3069        if param.get("annotation", None):
 3070            log.info("Annotations")
 3071            if param.get("annotation", {}).get("parquet", None):
 3072                log.info("Annotations 'parquet'...")
 3073                self.annotation_parquet()
 3074            if param.get("annotation", {}).get("bcftools", None):
 3075                log.info("Annotations 'bcftools'...")
 3076                self.annotation_bcftools()
 3077            if param.get("annotation", {}).get("snpsift", None):
 3078                log.info("Annotations 'snpsift'...")
 3079                self.annotation_snpsift()
 3080            if param.get("annotation", {}).get("annovar", None):
 3081                log.info("Annotations 'annovar'...")
 3082                self.annotation_annovar()
 3083            if param.get("annotation", {}).get("snpeff", None):
 3084                log.info("Annotations 'snpeff'...")
 3085                self.annotation_snpeff()
 3086            if param.get("annotation", {}).get("exomiser", None) is not None:
 3087                log.info("Annotations 'exomiser'...")
 3088                self.annotation_exomiser()
 3089            if param.get("annotation", {}).get("splice", None) is not None:
 3090                log.info("Annotations 'splice' ...")
 3091                self.annotation_splice()
 3092
 3093        # Explode INFOS fields into table fields
 3094        if self.get_explode_infos():
 3095            self.explode_infos(
 3096                prefix=self.get_explode_infos_prefix(),
 3097                fields=self.get_explode_infos_fields(),
 3098                force=True,
 3099            )
 3100
 3101    def annotation_snpsift(self, threads: int = None) -> None:
 3102        """
 3103        This function annotate with bcftools
 3104
 3105        :param threads: Number of threads to use
 3106        :return: the value of the variable "return_value".
 3107        """
 3108
 3109        # DEBUG
 3110        log.debug("Start annotation with bcftools databases")
 3111
 3112        # Threads
 3113        if not threads:
 3114            threads = self.get_threads()
 3115        log.debug("Threads: " + str(threads))
 3116
 3117        # Config
 3118        config = self.get_config()
 3119        log.debug("Config: " + str(config))
 3120
 3121        # Config - snpSift
 3122        snpsift_bin_command = get_bin_command(
 3123            bin="SnpSift.jar",
 3124            tool="snpsift",
 3125            bin_type="jar",
 3126            config=config,
 3127            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3128        )
 3129        if not snpsift_bin_command:
 3130            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3131            log.error(msg_err)
 3132            raise ValueError(msg_err)
 3133
 3134        # Config - bcftools
 3135        bcftools_bin_command = get_bin_command(
 3136            bin="bcftools",
 3137            tool="bcftools",
 3138            bin_type="bin",
 3139            config=config,
 3140            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3141        )
 3142        if not bcftools_bin_command:
 3143            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3144            log.error(msg_err)
 3145            raise ValueError(msg_err)
 3146
 3147        # Config - BCFTools databases folders
 3148        databases_folders = set(
 3149            self.get_config()
 3150            .get("folders", {})
 3151            .get("databases", {})
 3152            .get("annotations", ["."])
 3153            + self.get_config()
 3154            .get("folders", {})
 3155            .get("databases", {})
 3156            .get("bcftools", ["."])
 3157        )
 3158        log.debug("Databases annotations: " + str(databases_folders))
 3159
 3160        # Param
 3161        annotations = (
 3162            self.get_param()
 3163            .get("annotation", {})
 3164            .get("snpsift", {})
 3165            .get("annotations", None)
 3166        )
 3167        log.debug("Annotations: " + str(annotations))
 3168
 3169        # Assembly
 3170        assembly = self.get_param().get(
 3171            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3172        )
 3173
 3174        # Data
 3175        table_variants = self.get_table_variants()
 3176
 3177        # Check if not empty
 3178        log.debug("Check if not empty")
 3179        sql_query_chromosomes = (
 3180            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3181        )
 3182        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3183        if not sql_query_chromosomes_df["count"][0]:
 3184            log.info(f"VCF empty")
 3185            return
 3186
 3187        # VCF header
 3188        vcf_reader = self.get_header()
 3189        log.debug("Initial header: " + str(vcf_reader.infos))
 3190
 3191        # Existing annotations
 3192        for vcf_annotation in self.get_header().infos:
 3193
 3194            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3195            log.debug(
 3196                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3197            )
 3198
 3199        if annotations:
 3200
 3201            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3202
 3203                # Export VCF file
 3204                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3205
 3206                # Init
 3207                commands = {}
 3208
 3209                for annotation in annotations:
 3210                    annotation_fields = annotations[annotation]
 3211
 3212                    # Annotation Name
 3213                    annotation_name = os.path.basename(annotation)
 3214
 3215                    if not annotation_fields:
 3216                        annotation_fields = {"INFO": None}
 3217
 3218                    log.debug(f"Annotation '{annotation_name}'")
 3219                    log.debug(
 3220                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3221                    )
 3222
 3223                    # Create Database
 3224                    database = Database(
 3225                        database=annotation,
 3226                        databases_folders=databases_folders,
 3227                        assembly=assembly,
 3228                    )
 3229
 3230                    # Find files
 3231                    db_file = database.get_database()
 3232                    db_file = full_path(db_file)
 3233                    db_hdr_file = database.get_header_file()
 3234                    db_hdr_file = full_path(db_hdr_file)
 3235                    db_file_type = database.get_format()
 3236                    db_tbi_file = f"{db_file}.tbi"
 3237                    db_file_compressed = database.is_compressed()
 3238
 3239                    # Check if compressed
 3240                    if not db_file_compressed:
 3241                        log.error(
 3242                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3243                        )
 3244                        raise ValueError(
 3245                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3246                        )
 3247
 3248                    # Check if indexed
 3249                    if not os.path.exists(db_tbi_file):
 3250                        log.error(
 3251                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3252                        )
 3253                        raise ValueError(
 3254                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3255                        )
 3256
 3257                    # Check index - try to create if not exists
 3258                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3259                        log.error("Annotation failed: database not valid")
 3260                        log.error(f"Annotation annotation file: {db_file}")
 3261                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3262                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3263                        raise ValueError(
 3264                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3265                        )
 3266                    else:
 3267
 3268                        log.debug(
 3269                            f"Annotation '{annotation}' - file: "
 3270                            + str(db_file)
 3271                            + " and "
 3272                            + str(db_hdr_file)
 3273                        )
 3274
 3275                        # Load header as VCF object
 3276                        db_hdr_vcf = Variants(input=db_hdr_file)
 3277                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3278                        log.debug(
 3279                            "Annotation database header: "
 3280                            + str(db_hdr_vcf_header_infos)
 3281                        )
 3282
 3283                        # For all fields in database
 3284                        annotation_fields_full = False
 3285                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3286                            annotation_fields = {
 3287                                key: key for key in db_hdr_vcf_header_infos
 3288                            }
 3289                            log.debug(
 3290                                "Annotation database header - All annotations added: "
 3291                                + str(annotation_fields)
 3292                            )
 3293                            annotation_fields_full = True
 3294
 3295                        # # Create file for field rename
 3296                        # log.debug("Create file for field rename")
 3297                        # tmp_rename = NamedTemporaryFile(
 3298                        #     prefix=self.get_prefix(),
 3299                        #     dir=self.get_tmp_dir(),
 3300                        #     suffix=".rename",
 3301                        #     delete=False,
 3302                        # )
 3303                        # tmp_rename_name = tmp_rename.name
 3304                        # tmp_files.append(tmp_rename_name)
 3305
 3306                        # Number of fields
 3307                        nb_annotation_field = 0
 3308                        annotation_list = []
 3309                        annotation_infos_rename_list = []
 3310
 3311                        for annotation_field in annotation_fields:
 3312
 3313                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3314                            annotation_fields_new_name = annotation_fields.get(
 3315                                annotation_field, annotation_field
 3316                            )
 3317                            if not annotation_fields_new_name:
 3318                                annotation_fields_new_name = annotation_field
 3319
 3320                            # Check if field is in DB and if field is not elready in input data
 3321                            if (
 3322                                annotation_field in db_hdr_vcf.get_header().infos
 3323                                and annotation_fields_new_name
 3324                                not in self.get_header().infos
 3325                            ):
 3326
 3327                                log.info(
 3328                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3329                                )
 3330
 3331                                # BCFTools annotate param to rename fields
 3332                                if annotation_field != annotation_fields_new_name:
 3333                                    annotation_infos_rename_list.append(
 3334                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3335                                    )
 3336
 3337                                # Add INFO field to header
 3338                                db_hdr_vcf_header_infos_number = (
 3339                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3340                                )
 3341                                db_hdr_vcf_header_infos_type = (
 3342                                    db_hdr_vcf_header_infos[annotation_field].type
 3343                                    or "String"
 3344                                )
 3345                                db_hdr_vcf_header_infos_description = (
 3346                                    db_hdr_vcf_header_infos[annotation_field].desc
 3347                                    or f"{annotation_field} description"
 3348                                )
 3349                                db_hdr_vcf_header_infos_source = (
 3350                                    db_hdr_vcf_header_infos[annotation_field].source
 3351                                    or "unknown"
 3352                                )
 3353                                db_hdr_vcf_header_infos_version = (
 3354                                    db_hdr_vcf_header_infos[annotation_field].version
 3355                                    or "unknown"
 3356                                )
 3357
 3358                                vcf_reader.infos[annotation_fields_new_name] = (
 3359                                    vcf.parser._Info(
 3360                                        annotation_fields_new_name,
 3361                                        db_hdr_vcf_header_infos_number,
 3362                                        db_hdr_vcf_header_infos_type,
 3363                                        db_hdr_vcf_header_infos_description,
 3364                                        db_hdr_vcf_header_infos_source,
 3365                                        db_hdr_vcf_header_infos_version,
 3366                                        self.code_type_map[
 3367                                            db_hdr_vcf_header_infos_type
 3368                                        ],
 3369                                    )
 3370                                )
 3371
 3372                                annotation_list.append(annotation_field)
 3373
 3374                                nb_annotation_field += 1
 3375
 3376                            else:
 3377
 3378                                if (
 3379                                    annotation_field
 3380                                    not in db_hdr_vcf.get_header().infos
 3381                                ):
 3382                                    log.warning(
 3383                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3384                                    )
 3385                                if (
 3386                                    annotation_fields_new_name
 3387                                    in self.get_header().infos
 3388                                ):
 3389                                    log.warning(
 3390                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3391                                    )
 3392
 3393                        log.info(
 3394                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3395                        )
 3396
 3397                        annotation_infos = ",".join(annotation_list)
 3398
 3399                        if annotation_infos != "":
 3400
 3401                            # Annotated VCF (and error file)
 3402                            tmp_annotation_vcf_name = os.path.join(
 3403                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3404                            )
 3405                            tmp_annotation_vcf_name_err = (
 3406                                tmp_annotation_vcf_name + ".err"
 3407                            )
 3408
 3409                            # Add fields to annotate
 3410                            if not annotation_fields_full:
 3411                                annotation_infos_option = f"-info {annotation_infos}"
 3412                            else:
 3413                                annotation_infos_option = ""
 3414
 3415                            # Info fields rename
 3416                            if annotation_infos_rename_list:
 3417                                annotation_infos_rename = " -c " + ",".join(
 3418                                    annotation_infos_rename_list
 3419                                )
 3420                            else:
 3421                                annotation_infos_rename = ""
 3422
 3423                            # Annotate command
 3424                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3425
 3426                            # Add command
 3427                            commands[command_annotate] = tmp_annotation_vcf_name
 3428
 3429                if commands:
 3430
 3431                    # Export VCF file
 3432                    self.export_variant_vcf(
 3433                        vcf_file=tmp_vcf_name,
 3434                        remove_info=True,
 3435                        add_samples=False,
 3436                        index=True,
 3437                    )
 3438                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3439
 3440                    # Num command
 3441                    nb_command = 0
 3442
 3443                    # Annotate
 3444                    for command_annotate in commands:
 3445                        nb_command += 1
 3446                        log.info(
 3447                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3448                        )
 3449                        log.debug(f"command_annotate={command_annotate}")
 3450                        run_parallel_commands([command_annotate], threads)
 3451
 3452                        # Debug
 3453                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3454
 3455                        # Update variants
 3456                        log.info(
 3457                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3458                        )
 3459                        self.update_from_vcf(commands[command_annotate])
 3460
 3461    def annotation_bcftools(self, threads: int = None) -> None:
 3462        """
 3463        This function annotate with bcftools
 3464
 3465        :param threads: Number of threads to use
 3466        :return: the value of the variable "return_value".
 3467        """
 3468
 3469        # DEBUG
 3470        log.debug("Start annotation with bcftools databases")
 3471
 3472        # Threads
 3473        if not threads:
 3474            threads = self.get_threads()
 3475        log.debug("Threads: " + str(threads))
 3476
 3477        # Config
 3478        config = self.get_config()
 3479        log.debug("Config: " + str(config))
 3480
 3481        # DEBUG
 3482        delete_tmp = True
 3483        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3484            delete_tmp = False
 3485            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3486
 3487        # Config - BCFTools bin command
 3488        bcftools_bin_command = get_bin_command(
 3489            bin="bcftools",
 3490            tool="bcftools",
 3491            bin_type="bin",
 3492            config=config,
 3493            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3494        )
 3495        if not bcftools_bin_command:
 3496            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3497            log.error(msg_err)
 3498            raise ValueError(msg_err)
 3499
 3500        # Config - BCFTools databases folders
 3501        databases_folders = set(
 3502            self.get_config()
 3503            .get("folders", {})
 3504            .get("databases", {})
 3505            .get("annotations", ["."])
 3506            + self.get_config()
 3507            .get("folders", {})
 3508            .get("databases", {})
 3509            .get("bcftools", ["."])
 3510        )
 3511        log.debug("Databases annotations: " + str(databases_folders))
 3512
 3513        # Param
 3514        annotations = (
 3515            self.get_param()
 3516            .get("annotation", {})
 3517            .get("bcftools", {})
 3518            .get("annotations", None)
 3519        )
 3520        log.debug("Annotations: " + str(annotations))
 3521
 3522        # Assembly
 3523        assembly = self.get_param().get(
 3524            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3525        )
 3526
 3527        # Data
 3528        table_variants = self.get_table_variants()
 3529
 3530        # Check if not empty
 3531        log.debug("Check if not empty")
 3532        sql_query_chromosomes = (
 3533            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3534        )
 3535        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3536        if not sql_query_chromosomes_df["count"][0]:
 3537            log.info(f"VCF empty")
 3538            return
 3539
 3540        # Export in VCF
 3541        log.debug("Create initial file to annotate")
 3542        tmp_vcf = NamedTemporaryFile(
 3543            prefix=self.get_prefix(),
 3544            dir=self.get_tmp_dir(),
 3545            suffix=".vcf.gz",
 3546            delete=False,
 3547        )
 3548        tmp_vcf_name = tmp_vcf.name
 3549
 3550        # VCF header
 3551        vcf_reader = self.get_header()
 3552        log.debug("Initial header: " + str(vcf_reader.infos))
 3553
 3554        # Existing annotations
 3555        for vcf_annotation in self.get_header().infos:
 3556
 3557            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3558            log.debug(
 3559                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3560            )
 3561
 3562        if annotations:
 3563
 3564            tmp_ann_vcf_list = []
 3565            commands = []
 3566            tmp_files = []
 3567            err_files = []
 3568
 3569            for annotation in annotations:
 3570                annotation_fields = annotations[annotation]
 3571
 3572                # Annotation Name
 3573                annotation_name = os.path.basename(annotation)
 3574
 3575                if not annotation_fields:
 3576                    annotation_fields = {"INFO": None}
 3577
 3578                log.debug(f"Annotation '{annotation_name}'")
 3579                log.debug(
 3580                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3581                )
 3582
 3583                # Create Database
 3584                database = Database(
 3585                    database=annotation,
 3586                    databases_folders=databases_folders,
 3587                    assembly=assembly,
 3588                )
 3589
 3590                # Find files
 3591                db_file = database.get_database()
 3592                db_file = full_path(db_file)
 3593                db_hdr_file = database.get_header_file()
 3594                db_hdr_file = full_path(db_hdr_file)
 3595                db_file_type = database.get_format()
 3596                db_tbi_file = f"{db_file}.tbi"
 3597                db_file_compressed = database.is_compressed()
 3598
 3599                # Check if compressed
 3600                if not db_file_compressed:
 3601                    log.error(
 3602                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3603                    )
 3604                    raise ValueError(
 3605                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3606                    )
 3607
 3608                # Check if indexed
 3609                if not os.path.exists(db_tbi_file):
 3610                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3611                    raise ValueError(
 3612                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3613                    )
 3614
                # Check database file and header file exist (no auto-creation here)
 3616                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3617                    log.error("Annotation failed: database not valid")
 3618                    log.error(f"Annotation annotation file: {db_file}")
 3619                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3620                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3621                    raise ValueError(
 3622                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3623                    )
 3624                else:
 3625
 3626                    log.debug(
 3627                        f"Annotation '{annotation}' - file: "
 3628                        + str(db_file)
 3629                        + " and "
 3630                        + str(db_hdr_file)
 3631                    )
 3632
 3633                    # Load header as VCF object
 3634                    db_hdr_vcf = Variants(input=db_hdr_file)
 3635                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3636                    log.debug(
 3637                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3638                    )
 3639
 3640                    # For all fields in database
 3641                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3642                        annotation_fields = {
 3643                            key: key for key in db_hdr_vcf_header_infos
 3644                        }
 3645                        log.debug(
 3646                            "Annotation database header - All annotations added: "
 3647                            + str(annotation_fields)
 3648                        )
 3649
 3650                    # Number of fields
 3651                    nb_annotation_field = 0
 3652                    annotation_list = []
 3653
 3654                    for annotation_field in annotation_fields:
 3655
                        # Field new name, if configured; renaming is not fully managed yet (TODO)
 3657                        annotation_fields_new_name = annotation_fields.get(
 3658                            annotation_field, annotation_field
 3659                        )
 3660                        if not annotation_fields_new_name:
 3661                            annotation_fields_new_name = annotation_field
 3662
                        # Check if field is in DB and if field is not already in input data
 3664                        if (
 3665                            annotation_field in db_hdr_vcf.get_header().infos
 3666                            and annotation_fields_new_name
 3667                            not in self.get_header().infos
 3668                        ):
 3669
 3670                            log.info(
 3671                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3672                            )
 3673
 3674                            # Add INFO field to header
 3675                            db_hdr_vcf_header_infos_number = (
 3676                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3677                            )
 3678                            db_hdr_vcf_header_infos_type = (
 3679                                db_hdr_vcf_header_infos[annotation_field].type
 3680                                or "String"
 3681                            )
 3682                            db_hdr_vcf_header_infos_description = (
 3683                                db_hdr_vcf_header_infos[annotation_field].desc
 3684                                or f"{annotation_field} description"
 3685                            )
 3686                            db_hdr_vcf_header_infos_source = (
 3687                                db_hdr_vcf_header_infos[annotation_field].source
 3688                                or "unknown"
 3689                            )
 3690                            db_hdr_vcf_header_infos_version = (
 3691                                db_hdr_vcf_header_infos[annotation_field].version
 3692                                or "unknown"
 3693                            )
 3694
 3695                            vcf_reader.infos[annotation_fields_new_name] = (
 3696                                vcf.parser._Info(
 3697                                    annotation_fields_new_name,
 3698                                    db_hdr_vcf_header_infos_number,
 3699                                    db_hdr_vcf_header_infos_type,
 3700                                    db_hdr_vcf_header_infos_description,
 3701                                    db_hdr_vcf_header_infos_source,
 3702                                    db_hdr_vcf_header_infos_version,
 3703                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3704                                )
 3705                            )
 3706
 3707                            # annotation_list.append(annotation_field)
 3708                            if annotation_field != annotation_fields_new_name:
 3709                                annotation_list.append(
 3710                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3711                                )
 3712                            else:
 3713                                annotation_list.append(annotation_field)
 3714
 3715                            nb_annotation_field += 1
 3716
 3717                        else:
 3718
 3719                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3720                                log.warning(
 3721                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3722                                )
 3723                            if annotation_fields_new_name in self.get_header().infos:
 3724                                log.warning(
 3725                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3726                                )
 3727
 3728                    log.info(
 3729                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3730                    )
 3731
 3732                    annotation_infos = ",".join(annotation_list)
 3733
 3734                    if annotation_infos != "":
 3735
 3736                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3737                        log.debug("Protect Header file - remove #CHROM line if exists")
 3738                        tmp_header_vcf = NamedTemporaryFile(
 3739                            prefix=self.get_prefix(),
 3740                            dir=self.get_tmp_dir(),
 3741                            suffix=".hdr",
 3742                            delete=False,
 3743                        )
 3744                        tmp_header_vcf_name = tmp_header_vcf.name
 3745                        tmp_files.append(tmp_header_vcf_name)
 3746                        # Command
 3747                        if db_hdr_file.endswith(".gz"):
 3748                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3749                        else:
 3750                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3751                        # Run
 3752                        run_parallel_commands([command_extract_header], 1)
 3753
                        # Find chromosomes
 3755                        log.debug("Find chromosomes ")
 3756                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3757                        sql_query_chromosomes_df = self.get_query_to_df(
 3758                            sql_query_chromosomes
 3759                        )
 3760                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3761
 3762                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3763
 3764                        # BED columns in the annotation file
 3765                        if db_file_type in ["bed"]:
 3766                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3767
 3768                        for chrom in chomosomes_list:
 3769
 3770                            # Create BED on initial VCF
 3771                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3772                            tmp_bed = NamedTemporaryFile(
 3773                                prefix=self.get_prefix(),
 3774                                dir=self.get_tmp_dir(),
 3775                                suffix=".bed",
 3776                                delete=False,
 3777                            )
 3778                            tmp_bed_name = tmp_bed.name
 3779                            tmp_files.append(tmp_bed_name)
 3780
                            # Detect regions
 3782                            log.debug(
 3783                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3784                            )
 3785                            window = 1000000
 3786                            sql_query_intervals_for_bed = f"""
 3787                                SELECT  \"#CHROM\",
 3788                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3789                                        \"POS\"+{window}
 3790                                FROM {table_variants} as table_variants
 3791                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3792                            """
 3793                            regions = self.conn.execute(
 3794                                sql_query_intervals_for_bed
 3795                            ).fetchall()
 3796                            merged_regions = merge_regions(regions)
 3797                            log.debug(
 3798                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3799                            )
 3800
 3801                            header = ["#CHROM", "START", "END"]
 3802                            with open(tmp_bed_name, "w") as f:
 3803                                # Write the header with tab delimiter
 3804                                f.write("\t".join(header) + "\n")
 3805                                for d in merged_regions:
 3806                                    # Write each data row with tab delimiter
 3807                                    f.write("\t".join(map(str, d)) + "\n")
 3808
 3809                            # Tmp files
 3810                            tmp_annotation_vcf = NamedTemporaryFile(
 3811                                prefix=self.get_prefix(),
 3812                                dir=self.get_tmp_dir(),
 3813                                suffix=".vcf.gz",
 3814                                delete=False,
 3815                            )
 3816                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3817                            tmp_files.append(tmp_annotation_vcf_name)
 3818                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3819                            tmp_annotation_vcf_name_err = (
 3820                                tmp_annotation_vcf_name + ".err"
 3821                            )
 3822                            err_files.append(tmp_annotation_vcf_name_err)
 3823
 3824                            # Annotate Command
 3825                            log.debug(
 3826                                f"Annotation '{annotation}' - add bcftools command"
 3827                            )
 3828
 3829                            # Command
 3830                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3831
 3832                            # Add command
 3833                            commands.append(command_annotate)
 3834
 3835            # if some commands
 3836            if commands:
 3837
 3838                # Export VCF file
 3839                self.export_variant_vcf(
 3840                    vcf_file=tmp_vcf_name,
 3841                    remove_info=True,
 3842                    add_samples=False,
 3843                    index=True,
 3844                )
 3845
 3846                # Threads
 3847                # calculate threads for annotated commands
 3848                if commands:
 3849                    threads_bcftools_annotate = round(threads / len(commands))
 3850                else:
 3851                    threads_bcftools_annotate = 1
 3852
 3853                if not threads_bcftools_annotate:
 3854                    threads_bcftools_annotate = 1
 3855
 3856                # Add threads option to bcftools commands
 3857                if threads_bcftools_annotate > 1:
 3858                    commands_threaded = []
 3859                    for command in commands:
 3860                        commands_threaded.append(
 3861                            command.replace(
 3862                                f"{bcftools_bin_command} annotate ",
 3863                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3864                            )
 3865                        )
 3866                    commands = commands_threaded
 3867
 3868                # Command annotation multithreading
 3869                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3870                log.info(
 3871                    f"Annotation - Annotation multithreaded in "
 3872                    + str(len(commands))
 3873                    + " commands"
 3874                )
 3875
 3876                run_parallel_commands(commands, threads)
 3877
 3878                # Merge
 3879                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 3880
 3881                if tmp_ann_vcf_list_cmd:
 3882
 3883                    # Tmp file
 3884                    tmp_annotate_vcf = NamedTemporaryFile(
 3885                        prefix=self.get_prefix(),
 3886                        dir=self.get_tmp_dir(),
 3887                        suffix=".vcf.gz",
 3888                        delete=True,
 3889                    )
 3890                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 3891                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 3892                    err_files.append(tmp_annotate_vcf_name_err)
 3893
 3894                    # Tmp file remove command
 3895                    tmp_files_remove_command = ""
 3896                    if tmp_files:
 3897                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 3898
 3899                    # Command merge
 3900                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 3901                    log.info(
 3902                        f"Annotation - Annotation merging "
 3903                        + str(len(commands))
 3904                        + " annotated files"
 3905                    )
 3906                    log.debug(f"Annotation - merge command: {merge_command}")
 3907                    run_parallel_commands([merge_command], 1)
 3908
 3909                    # Error messages
 3910                    log.info(f"Error/Warning messages:")
 3911                    error_message_command_all = []
 3912                    error_message_command_warning = []
 3913                    error_message_command_err = []
 3914                    for err_file in err_files:
 3915                        with open(err_file, "r") as f:
 3916                            for line in f:
 3917                                message = line.strip()
 3918                                error_message_command_all.append(message)
 3919                                if line.startswith("[W::"):
 3920                                    error_message_command_warning.append(message)
 3921                                if line.startswith("[E::"):
 3922                                    error_message_command_err.append(
 3923                                        f"{err_file}: " + message
 3924                                    )
 3925                    # log info
 3926                    for message in list(
 3927                        set(error_message_command_err + error_message_command_warning)
 3928                    ):
 3929                        log.info(f"   {message}")
 3930                    # debug info
 3931                    for message in list(set(error_message_command_all)):
 3932                        log.debug(f"   {message}")
 3933                    # failed
 3934                    if len(error_message_command_err):
 3935                        log.error("Annotation failed: Error in commands")
 3936                        raise ValueError("Annotation failed: Error in commands")
 3937
 3938                    # Update variants
 3939                    log.info(f"Annotation - Updating...")
 3940                    self.update_from_vcf(tmp_annotate_vcf_name)
 3941
 3942    def annotation_exomiser(self, threads: int = None) -> None:
 3943        """
 3944        This function annotate with Exomiser
 3945
 3946        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 3947        - "analysis" (dict/file):
            Full analysis dictionary parameters (see Exomiser docs).
 3949            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 3951            Default : None
 3952        - "preset" (string):
 3953            Analysis preset (available in config folder).
 3954            Used if no full "analysis" is provided.
 3955            Default: "exome"
 3956        - "phenopacket" (dict/file):
            Samples and phenotypic features parameters (see Exomiser docs).
 3958            Either a dict, or a file in JSON or YAML format.
 3959            Default: None
 3960        - "subject" (dict):
 3961            Sample parameters (see Exomiser docs).
 3962            Example:
 3963                "subject":
 3964                    {
 3965                        "id": "ISDBM322017",
 3966                        "sex": "FEMALE"
 3967                    }
 3968            Default: None
 3969        - "sample" (string):
 3970            Sample name to construct "subject" section:
 3971                "subject":
 3972                    {
 3973                        "id": "<sample>",
 3974                        "sex": "UNKNOWN_SEX"
 3975                    }
 3976            Default: None
 3977        - "phenotypicFeatures" (dict)
 3978            Phenotypic features to construct "subject" section.
 3979            Example:
 3980                "phenotypicFeatures":
 3981                    [
 3982                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 3983                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 3984                    ]
 3985        - "hpo" (list)
 3986            List of HPO ids as phenotypic features.
 3987            Example:
 3988                "hpo": ['0001156', '0001363', '0011304', '0010055']
 3989            Default: []
 3990        - "outputOptions" (dict):
 3991            Output options (see Exomiser docs).
 3992            Default:
 3993                "output_options" =
 3994                    {
 3995                        "outputContributingVariantsOnly": False,
 3996                        "numGenes": 0,
 3997                        "outputFormats": ["TSV_VARIANT", "VCF"]
 3998                    }
 3999        - "transcript_source" (string):
 4000            Transcript source (either "refseq", "ucsc", "ensembl")
 4001            Default: "refseq"
 4002        - "exomiser_to_info" (boolean):
 4003            Add exomiser TSV file columns as INFO fields in VCF.
 4004            Default: False
 4005        - "release" (string):
            Exomiser database release.
 4007            If not exists, database release will be downloaded (take a while).
 4008            Default: None (provided by application.properties configuration file)
 4009        - "exomiser_application_properties" (file):
 4010            Exomiser configuration file (see Exomiser docs).
 4011            Useful to automatically download databases (especially for specific genome databases).
 4012
 4013        Notes:
 4014        - If no sample in parameters, first sample in VCF will be chosen
 4015        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4016
 4017        :param threads: The number of threads to use
 4018        :return: None.
 4019        """
 4020
 4021        # DEBUG
 4022        log.debug("Start annotation with Exomiser databases")
 4023
 4024        # Threads
 4025        if not threads:
 4026            threads = self.get_threads()
 4027        log.debug("Threads: " + str(threads))
 4028
 4029        # Config
 4030        config = self.get_config()
 4031        log.debug("Config: " + str(config))
 4032
 4033        # Config - Folders - Databases
 4034        databases_folders = (
 4035            config.get("folders", {})
 4036            .get("databases", {})
 4037            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4038        )
 4039        databases_folders = full_path(databases_folders)
 4040        if not os.path.exists(databases_folders):
 4041            log.error(f"Databases annotations: {databases_folders} NOT found")
 4042        log.debug("Databases annotations: " + str(databases_folders))
 4043
 4044        # Config - Exomiser
 4045        exomiser_bin_command = get_bin_command(
 4046            bin="exomiser-cli*.jar",
 4047            tool="exomiser",
 4048            bin_type="jar",
 4049            config=config,
 4050            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4051        )
 4052        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4053        if not exomiser_bin_command:
 4054            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4055            log.error(msg_err)
 4056            raise ValueError(msg_err)
 4057
 4058        # Param
 4059        param = self.get_param()
 4060        log.debug("Param: " + str(param))
 4061
 4062        # Param - Exomiser
 4063        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4064        log.debug(f"Param Exomiser: {param_exomiser}")
 4065
 4066        # Param - Assembly
 4067        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4068        log.debug("Assembly: " + str(assembly))
 4069
 4070        # Data
 4071        table_variants = self.get_table_variants()
 4072
 4073        # Check if not empty
 4074        log.debug("Check if not empty")
 4075        sql_query_chromosomes = (
 4076            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4077        )
 4078        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4079            log.info(f"VCF empty")
 4080            return False
 4081
 4082        # VCF header
 4083        vcf_reader = self.get_header()
 4084        log.debug("Initial header: " + str(vcf_reader.infos))
 4085
 4086        # Samples
 4087        samples = self.get_header_sample_list()
 4088        if not samples:
 4089            log.error("No Samples in VCF")
 4090            return False
 4091        log.debug(f"Samples: {samples}")
 4092
 4093        # Memory limit
 4094        memory_limit = self.get_memory("8G")
 4095        log.debug(f"memory_limit: {memory_limit}")
 4096
 4097        # Exomiser java options
 4098        exomiser_java_options = (
 4099            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4100        )
 4101        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4102
 4103        # Download Exomiser (if not exists)
 4104        exomiser_release = param_exomiser.get("release", None)
 4105        exomiser_application_properties = param_exomiser.get(
 4106            "exomiser_application_properties", None
 4107        )
 4108        databases_download_exomiser(
 4109            assemblies=[assembly],
 4110            exomiser_folder=databases_folders,
 4111            exomiser_release=exomiser_release,
 4112            exomiser_phenotype_release=exomiser_release,
 4113            exomiser_application_properties=exomiser_application_properties,
 4114        )
 4115
 4116        # Force annotation
 4117        force_update_annotation = True
 4118
 4119        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4120            log.debug("Start annotation Exomiser")
 4121
 4122            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4123
 4124                # tmp_dir = "/tmp/exomiser"
 4125
 4126                ### ANALYSIS ###
 4127                ################
 4128
 4129                # Create analysis.json through analysis dict
 4130                # either analysis in param or by default
 4131                # depending on preset exome/genome)
 4132
 4133                # Init analysis dict
 4134                param_exomiser_analysis_dict = {}
 4135
 4136                # analysis from param
 4137                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4138                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4139
                # If analysis in param -> load analysis json
 4141                if param_exomiser_analysis:
 4142
 4143                    # If param analysis is a file and exists
 4144                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4145                        param_exomiser_analysis
 4146                    ):
 4147                        # Load analysis file into analysis dict (either yaml or json)
 4148                        with open(param_exomiser_analysis) as json_file:
 4149                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4150
 4151                    # If param analysis is a dict
 4152                    elif isinstance(param_exomiser_analysis, dict):
 4153                        # Load analysis dict into analysis dict (either yaml or json)
 4154                        param_exomiser_analysis_dict = param_exomiser_analysis
 4155
 4156                    # Error analysis type
 4157                    else:
 4158                        log.error(f"Analysis type unknown. Check param file.")
 4159                        raise ValueError(f"Analysis type unknown. Check param file.")
 4160
 4161                # Case no input analysis config file/dict
 4162                # Use preset (exome/genome) to open default config file
 4163                if not param_exomiser_analysis_dict:
 4164
 4165                    # default preset
 4166                    default_preset = "exome"
 4167
 4168                    # Get param preset or default preset
 4169                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4170
 4171                    # Try to find if preset is a file
 4172                    if os.path.exists(param_exomiser_preset):
 4173                        # Preset file is provided in full path
 4174                        param_exomiser_analysis_default_config_file = (
 4175                            param_exomiser_preset
 4176                        )
 4177                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4178                    #     # Preset file is provided in full path
 4179                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4180                    elif os.path.exists(
 4181                        os.path.join(folder_config, param_exomiser_preset)
 4182                    ):
                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4184                        param_exomiser_analysis_default_config_file = os.path.join(
 4185                            folder_config, param_exomiser_preset
 4186                        )
 4187                    else:
 4188                        # Construct preset file
 4189                        param_exomiser_analysis_default_config_file = os.path.join(
 4190                            folder_config,
 4191                            f"preset-{param_exomiser_preset}-analysis.json",
 4192                        )
 4193
 4194                    # If preset file exists
 4195                    param_exomiser_analysis_default_config_file = full_path(
 4196                        param_exomiser_analysis_default_config_file
 4197                    )
 4198                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4199                        # Load prest file into analysis dict (either yaml or json)
 4200                        with open(
 4201                            param_exomiser_analysis_default_config_file
 4202                        ) as json_file:
 4203                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4204                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4205                                json_file
 4206                            )
 4207
 4208                    # Error preset file
 4209                    else:
 4210                        log.error(
 4211                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4212                        )
 4213                        raise ValueError(
 4214                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4215                        )
 4216
 4217                # If no analysis dict created
 4218                if not param_exomiser_analysis_dict:
 4219                    log.error(f"No analysis config")
 4220                    raise ValueError(f"No analysis config")
 4221
 4222                # Log
 4223                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4224
 4225                ### PHENOPACKET ###
 4226                ###################
 4227
 4228                # If no PhenoPacket in analysis dict -> check in param
 4229                if "phenopacket" not in param_exomiser_analysis_dict:
 4230
 4231                    # If PhenoPacket in param -> load anlaysis json
 4232                    if param_exomiser.get("phenopacket", None):
 4233
 4234                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4235                        param_exomiser_phenopacket = full_path(
 4236                            param_exomiser_phenopacket
 4237                        )
 4238
 4239                        # If param phenopacket is a file and exists
 4240                        if isinstance(
 4241                            param_exomiser_phenopacket, str
 4242                        ) and os.path.exists(param_exomiser_phenopacket):
 4243                            # Load phenopacket file into analysis dict (either yaml or json)
 4244                            with open(param_exomiser_phenopacket) as json_file:
 4245                                param_exomiser_analysis_dict["phenopacket"] = (
 4246                                    yaml.safe_load(json_file)
 4247                                )
 4248
 4249                        # If param phenopacket is a dict
 4250                        elif isinstance(param_exomiser_phenopacket, dict):
 4251                            # Load phenopacket dict into analysis dict (either yaml or json)
 4252                            param_exomiser_analysis_dict["phenopacket"] = (
 4253                                param_exomiser_phenopacket
 4254                            )
 4255
 4256                        # Error phenopacket type
 4257                        else:
 4258                            log.error(f"Phenopacket type unknown. Check param file.")
 4259                            raise ValueError(
 4260                                f"Phenopacket type unknown. Check param file."
 4261                            )
 4262
 4263                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4264                if "phenopacket" not in param_exomiser_analysis_dict:
 4265
 4266                    # Init PhenoPacket
 4267                    param_exomiser_analysis_dict["phenopacket"] = {
 4268                        "id": "analysis",
 4269                        "proband": {},
 4270                    }
 4271
 4272                    ### Add subject ###
 4273
 4274                    # If subject exists
 4275                    param_exomiser_subject = param_exomiser.get("subject", {})
 4276
 4277                    # If subject not exists -> found sample ID
 4278                    if not param_exomiser_subject:
 4279
 4280                        # Found sample ID in param
 4281                        sample = param_exomiser.get("sample", None)
 4282
 4283                        # Find sample ID (first sample)
 4284                        if not sample:
 4285                            sample_list = self.get_header_sample_list()
 4286                            if len(sample_list) > 0:
 4287                                sample = sample_list[0]
 4288                            else:
 4289                                log.error(f"No sample found")
 4290                                raise ValueError(f"No sample found")
 4291
 4292                        # Create subject
 4293                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4294
 4295                    # Add to dict
 4296                    param_exomiser_analysis_dict["phenopacket"][
 4297                        "subject"
 4298                    ] = param_exomiser_subject
 4299
 4300                    ### Add "phenotypicFeatures" ###
 4301
 4302                    # If phenotypicFeatures exists
 4303                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4304                        "phenotypicFeatures", []
 4305                    )
 4306
 4307                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4308                    if not param_exomiser_phenotypicfeatures:
 4309
 4310                        # Found HPO in param
 4311                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4312
 4313                        # Split HPO if list in string format separated by comma
 4314                        if isinstance(param_exomiser_hpo, str):
 4315                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4316
 4317                        # Create HPO list
 4318                        for hpo in param_exomiser_hpo:
 4319                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4320                            param_exomiser_phenotypicfeatures.append(
 4321                                {
 4322                                    "type": {
 4323                                        "id": f"HP:{hpo_clean}",
 4324                                        "label": f"HP:{hpo_clean}",
 4325                                    }
 4326                                }
 4327                            )
 4328
 4329                    # Add to dict
 4330                    param_exomiser_analysis_dict["phenopacket"][
 4331                        "phenotypicFeatures"
 4332                    ] = param_exomiser_phenotypicfeatures
 4333
 4334                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4335                    if not param_exomiser_phenotypicfeatures:
 4336                        for step in param_exomiser_analysis_dict.get(
 4337                            "analysis", {}
 4338                        ).get("steps", []):
 4339                            if "hiPhivePrioritiser" in step:
 4340                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4341                                    "steps", []
 4342                                ).remove(step)
 4343
 4344                ### Add Input File ###
 4345
 4346                # Initial file name and htsFiles
 4347                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4348                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4349                    {
 4350                        "uri": tmp_vcf_name,
 4351                        "htsFormat": "VCF",
 4352                        "genomeAssembly": assembly,
 4353                    }
 4354                ]
 4355
 4356                ### Add metaData ###
 4357
 4358                # If metaData not in analysis dict
 4359                if "metaData" not in param_exomiser_analysis_dict:
 4360                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4361                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4362                        "createdBy": "howard",
 4363                        "phenopacketSchemaVersion": 1,
 4364                    }
 4365
 4366                ### OutputOptions ###
 4367
 4368                # Init output result folder
 4369                output_results = os.path.join(tmp_dir, "results")
 4370
 4371                # If no outputOptions in analysis dict
 4372                if "outputOptions" not in param_exomiser_analysis_dict:
 4373
 4374                    # default output formats
 4375                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4376
 4377                    # Get outputOptions in param
 4378                    output_options = param_exomiser.get("outputOptions", None)
 4379
 4380                    # If no output_options in param -> check
 4381                    if not output_options:
 4382                        output_options = {
 4383                            "outputContributingVariantsOnly": False,
 4384                            "numGenes": 0,
 4385                            "outputFormats": defaut_output_formats,
 4386                        }
 4387
 4388                    # Replace outputDirectory in output options
 4389                    output_options["outputDirectory"] = output_results
 4390                    output_options["outputFileName"] = "howard"
 4391
 4392                    # Add outputOptions in analysis dict
 4393                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4394
 4395                else:
 4396
 4397                    # Replace output_results and output format (if exists in param)
 4398                    param_exomiser_analysis_dict["outputOptions"][
 4399                        "outputDirectory"
 4400                    ] = output_results
 4401                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4402                        list(
 4403                            set(
 4404                                param_exomiser_analysis_dict.get(
 4405                                    "outputOptions", {}
 4406                                ).get("outputFormats", [])
 4407                                + ["TSV_VARIANT", "VCF"]
 4408                            )
 4409                        )
 4410                    )
 4411
 4412                # log
 4413                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4414
 4415                ### ANALYSIS FILE ###
 4416                #####################
 4417
 4418                ### Full JSON analysis config file ###
 4419
 4420                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4421                with open(exomiser_analysis, "w") as fp:
 4422                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4423
 4424                ### SPLIT analysis and sample config files
 4425
 4426                # Splitted analysis dict
 4427                param_exomiser_analysis_dict_for_split = (
 4428                    param_exomiser_analysis_dict.copy()
 4429                )
 4430
 4431                # Phenopacket JSON file
 4432                exomiser_analysis_phenopacket = os.path.join(
 4433                    tmp_dir, "analysis_phenopacket.json"
 4434                )
 4435                with open(exomiser_analysis_phenopacket, "w") as fp:
 4436                    json.dump(
 4437                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4438                        fp,
 4439                        indent=4,
 4440                    )
 4441
 4442                # Analysis JSON file without Phenopacket parameters
 4443                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4444                exomiser_analysis_analysis = os.path.join(
 4445                    tmp_dir, "analysis_analysis.json"
 4446                )
 4447                with open(exomiser_analysis_analysis, "w") as fp:
 4448                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4449
 4450                ### INITAL VCF file ###
 4451                #######################
 4452
 4453                ### Create list of samples to use and include inti initial VCF file ####
 4454
 4455                # Subject (main sample)
 4456                # Get sample ID in analysis dict
 4457                sample_subject = (
 4458                    param_exomiser_analysis_dict.get("phenopacket", {})
 4459                    .get("subject", {})
 4460                    .get("id", None)
 4461                )
 4462                sample_proband = (
 4463                    param_exomiser_analysis_dict.get("phenopacket", {})
 4464                    .get("proband", {})
 4465                    .get("subject", {})
 4466                    .get("id", None)
 4467                )
 4468                sample = []
 4469                if sample_subject:
 4470                    sample.append(sample_subject)
 4471                if sample_proband:
 4472                    sample.append(sample_proband)
 4473
 4474                # Get sample ID within Pedigree
 4475                pedigree_persons_list = (
 4476                    param_exomiser_analysis_dict.get("phenopacket", {})
 4477                    .get("pedigree", {})
 4478                    .get("persons", {})
 4479                )
 4480
 4481                # Create list with all sample ID in pedigree (if exists)
 4482                pedigree_persons = []
 4483                for person in pedigree_persons_list:
 4484                    pedigree_persons.append(person.get("individualId"))
 4485
 4486                # Concat subject sample ID and samples ID in pedigreesamples
 4487                samples = list(set(sample + pedigree_persons))
 4488
 4489                # Check if sample list is not empty
 4490                if not samples:
 4491                    log.error(f"No samples found")
 4492                    raise ValueError(f"No samples found")
 4493
 4494                # Create VCF with sample (either sample in param or first one by default)
 4495                # Export VCF file
 4496                self.export_variant_vcf(
 4497                    vcf_file=tmp_vcf_name,
 4498                    remove_info=True,
 4499                    add_samples=True,
 4500                    list_samples=samples,
 4501                    index=False,
 4502                )
 4503
 4504                ### Execute Exomiser ###
 4505                ########################
 4506
 4507                # Init command
 4508                exomiser_command = ""
 4509
 4510                # Command exomiser options
 4511                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4512
 4513                # Release
 4514                exomiser_release = param_exomiser.get("release", None)
 4515                if exomiser_release:
 4516                    # phenotype data version
 4517                    exomiser_options += (
 4518                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4519                    )
 4520                    # data version
 4521                    exomiser_options += (
 4522                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4523                    )
 4524                    # variant white list
 4525                    variant_white_list_file = (
 4526                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4527                    )
 4528                    if os.path.exists(
 4529                        os.path.join(
 4530                            databases_folders, assembly, variant_white_list_file
 4531                        )
 4532                    ):
 4533                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4534
 4535                # transcript_source
 4536                transcript_source = param_exomiser.get(
 4537                    "transcript_source", None
 4538                )  # ucsc, refseq, ensembl
 4539                if transcript_source:
 4540                    exomiser_options += (
 4541                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4542                    )
 4543
 4544                # If analysis contain proband param
 4545                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4546                    "proband", {}
 4547                ):
 4548                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4549
 4550                # If no proband (usually uniq sample)
 4551                else:
 4552                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4553
 4554                # Log
 4555                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4556
 4557                # Run command
 4558                result = subprocess.call(
 4559                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4560                )
 4561                if result:
 4562                    log.error("Exomiser command failed")
 4563                    raise ValueError("Exomiser command failed")
 4564
 4565                ### RESULTS ###
 4566                ###############
 4567
 4568                ### Annotate with TSV fields ###
 4569
 4570                # Init result tsv file
 4571                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4572
 4573                # Init result tsv file
 4574                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4575
 4576                # Parse TSV file and explode columns in INFO field
 4577                if exomiser_to_info and os.path.exists(output_results_tsv):
 4578
 4579                    # Log
 4580                    log.debug("Exomiser columns to VCF INFO field")
 4581
 4582                    # Retrieve columns and types
 4583                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4584                    output_results_tsv_df = self.get_query_to_df(query)
 4585                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4586
 4587                    # Init concat fields for update
 4588                    sql_query_update_concat_fields = []
 4589
 4590                    # Fields to avoid
 4591                    fields_to_avoid = [
 4592                        "CONTIG",
 4593                        "START",
 4594                        "END",
 4595                        "REF",
 4596                        "ALT",
 4597                        "QUAL",
 4598                        "FILTER",
 4599                        "GENOTYPE",
 4600                    ]
 4601
 4602                    # List all columns to add into header
 4603                    for header_column in output_results_tsv_columns:
 4604
 4605                        # If header column is enable
 4606                        if header_column not in fields_to_avoid:
 4607
 4608                            # Header info type
 4609                            header_info_type = "String"
 4610                            header_column_df = output_results_tsv_df[header_column]
 4611                            header_column_df_dtype = header_column_df.dtype
 4612                            if header_column_df_dtype == object:
 4613                                if (
 4614                                    pd.to_numeric(header_column_df, errors="coerce")
 4615                                    .notnull()
 4616                                    .all()
 4617                                ):
 4618                                    header_info_type = "Float"
 4619                            else:
 4620                                header_info_type = "Integer"
 4621
 4622                            # Header info
 4623                            characters_to_validate = ["-"]
 4624                            pattern = "[" + "".join(characters_to_validate) + "]"
 4625                            header_info_name = re.sub(
 4626                                pattern,
 4627                                "_",
 4628                                f"Exomiser_{header_column}".replace("#", ""),
 4629                            )
 4630                            header_info_number = "."
 4631                            header_info_description = (
 4632                                f"Exomiser {header_column} annotation"
 4633                            )
 4634                            header_info_source = "Exomiser"
 4635                            header_info_version = "unknown"
 4636                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4637                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4638                                header_info_name,
 4639                                header_info_number,
 4640                                header_info_type,
 4641                                header_info_description,
 4642                                header_info_source,
 4643                                header_info_version,
 4644                                header_info_code,
 4645                            )
 4646
 4647                            # Add field to add for update to concat fields
 4648                            sql_query_update_concat_fields.append(
 4649                                f"""
 4650                                CASE
 4651                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4652                                    THEN concat(
 4653                                        '{header_info_name}=',
 4654                                        table_parquet."{header_column}",
 4655                                        ';'
 4656                                        )
 4657
 4658                                    ELSE ''
 4659                                END
 4660                            """
 4661                            )
 4662
 4663                    # Update query
 4664                    sql_query_update = f"""
 4665                        UPDATE {table_variants} as table_variants
 4666                            SET INFO = concat(
 4667                                            CASE
 4668                                                WHEN INFO NOT IN ('', '.')
 4669                                                THEN INFO
 4670                                                ELSE ''
 4671                                            END,
 4672                                            CASE
 4673                                                WHEN table_variants.INFO NOT IN ('','.')
 4674                                                THEN ';'
 4675                                                ELSE ''
 4676                                            END,
 4677                                            (
 4678                                            SELECT 
 4679                                                concat(
 4680                                                    {",".join(sql_query_update_concat_fields)}
 4681                                                )
 4682                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4683                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4684                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4685                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4686                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4687                                            )
 4688                                        )
 4689                            ;
 4690                        """
 4691
 4692                    # Update
 4693                    self.conn.execute(sql_query_update)
 4694
 4695                ### Annotate with VCF INFO field ###
 4696
 4697                # Init result VCF file
 4698                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4699
 4700                # If VCF exists
 4701                if os.path.exists(output_results_vcf):
 4702
 4703                    # Log
 4704                    log.debug("Exomiser result VCF update variants")
 4705
 4706                    # Find Exomiser INFO field annotation in header
 4707                    with gzip.open(output_results_vcf, "rt") as f:
 4708                        header_list = self.read_vcf_header(f)
 4709                    exomiser_vcf_header = vcf.Reader(
 4710                        io.StringIO("\n".join(header_list))
 4711                    )
 4712
 4713                    # Add annotation INFO field to header
 4714                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4715
 4716                    # Update variants with VCF
 4717                    self.update_from_vcf(output_results_vcf)
 4718
 4719        return True
 4720
 4721    def annotation_snpeff(self, threads: int = None) -> None:
 4722        """
 4723        This function annotate with snpEff
 4724
 4725        :param threads: The number of threads to use
 4726        :return: the value of the variable "return_value".
 4727        """
 4728
 4729        # DEBUG
 4730        log.debug("Start annotation with snpeff databases")
 4731
 4732        # Threads
 4733        if not threads:
 4734            threads = self.get_threads()
 4735        log.debug("Threads: " + str(threads))
 4736
 4737        # DEBUG
 4738        delete_tmp = True
 4739        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4740            delete_tmp = False
 4741            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4742
 4743        # Config
 4744        config = self.get_config()
 4745        log.debug("Config: " + str(config))
 4746
 4747        # Config - Folders - Databases
 4748        databases_folders = (
 4749            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4750        )
 4751        log.debug("Databases annotations: " + str(databases_folders))
 4752
 4753        # # Config - Java
 4754        # java_bin = get_bin(
 4755        #     tool="java",
 4756        #     bin="java",
 4757        #     bin_type="bin",
 4758        #     config=config,
 4759        #     default_folder="/usr/bin",
 4760        # )
 4761        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4762        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4763        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4764
 4765        # # Config - snpEff bin
 4766        # snpeff_jar = get_bin(
 4767        #     tool="snpeff",
 4768        #     bin="snpEff.jar",
 4769        #     bin_type="jar",
 4770        #     config=config,
 4771        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4772        # )
 4773        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4774        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4775        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4776
 4777        # Config - snpEff bin command
 4778        snpeff_bin_command = get_bin_command(
 4779            bin="snpEff.jar",
 4780            tool="snpeff",
 4781            bin_type="jar",
 4782            config=config,
 4783            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4784        )
 4785        if not snpeff_bin_command:
 4786            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4787            log.error(msg_err)
 4788            raise ValueError(msg_err)
 4789
 4790        # Config - snpEff databases
 4791        snpeff_databases = (
 4792            config.get("folders", {})
 4793            .get("databases", {})
 4794            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4795        )
 4796        snpeff_databases = full_path(snpeff_databases)
 4797        if snpeff_databases is not None and snpeff_databases != "":
 4798            log.debug(f"Create snpEff databases folder")
 4799            if not os.path.exists(snpeff_databases):
 4800                os.makedirs(snpeff_databases)
 4801
 4802        # Param
 4803        param = self.get_param()
 4804        log.debug("Param: " + str(param))
 4805
 4806        # Param
 4807        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4808        log.debug("Options: " + str(options))
 4809
 4810        # Param - Assembly
 4811        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4812
 4813        # Param - Options
 4814        snpeff_options = (
 4815            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4816        )
 4817        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4818        snpeff_csvstats = (
 4819            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4820        )
 4821        if snpeff_stats:
 4822            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4823            snpeff_stats = full_path(snpeff_stats)
 4824            snpeff_options += f" -stats {snpeff_stats}"
 4825        if snpeff_csvstats:
 4826            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4827            snpeff_csvstats = full_path(snpeff_csvstats)
 4828            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4829
 4830        # Data
 4831        table_variants = self.get_table_variants()
 4832
 4833        # Check if not empty
 4834        log.debug("Check if not empty")
 4835        sql_query_chromosomes = (
 4836            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4837        )
 4838        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4839        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4840            log.info(f"VCF empty")
 4841            return
 4842
 4843        # Export in VCF
 4844        log.debug("Create initial file to annotate")
 4845        tmp_vcf = NamedTemporaryFile(
 4846            prefix=self.get_prefix(),
 4847            dir=self.get_tmp_dir(),
 4848            suffix=".vcf.gz",
 4849            delete=True,
 4850        )
 4851        tmp_vcf_name = tmp_vcf.name
 4852
 4853        # VCF header
 4854        vcf_reader = self.get_header()
 4855        log.debug("Initial header: " + str(vcf_reader.infos))
 4856
 4857        # Existing annotations
 4858        for vcf_annotation in self.get_header().infos:
 4859
 4860            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4861            log.debug(
 4862                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4863            )
 4864
 4865        # Memory limit
 4866        # if config.get("memory", None):
 4867        #     memory_limit = config.get("memory", "8G")
 4868        # else:
 4869        #     memory_limit = "8G"
 4870        memory_limit = self.get_memory("8G")
 4871        log.debug(f"memory_limit: {memory_limit}")
 4872
 4873        # snpEff java options
 4874        snpeff_java_options = (
 4875            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4876        )
 4877        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4878
 4879        force_update_annotation = True
 4880
 4881        if "ANN" not in self.get_header().infos or force_update_annotation:
 4882
 4883            # Check snpEff database
 4884            log.debug(f"Check snpEff databases {[assembly]}")
 4885            databases_download_snpeff(
 4886                folder=snpeff_databases, assemblies=[assembly], config=config
 4887            )
 4888
 4889            # Export VCF file
 4890            self.export_variant_vcf(
 4891                vcf_file=tmp_vcf_name,
 4892                remove_info=True,
 4893                add_samples=False,
 4894                index=True,
 4895            )
 4896
 4897            # Tmp file
 4898            err_files = []
 4899            tmp_annotate_vcf = NamedTemporaryFile(
 4900                prefix=self.get_prefix(),
 4901                dir=self.get_tmp_dir(),
 4902                suffix=".vcf",
 4903                delete=False,
 4904            )
 4905            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4906            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4907            err_files.append(tmp_annotate_vcf_name_err)
 4908
 4909            # Command
 4910            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 4911            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 4912            run_parallel_commands([snpeff_command], 1)
 4913
 4914            # Error messages
 4915            log.info(f"Error/Warning messages:")
 4916            error_message_command_all = []
 4917            error_message_command_warning = []
 4918            error_message_command_err = []
 4919            for err_file in err_files:
 4920                with open(err_file, "r") as f:
 4921                    for line in f:
 4922                        message = line.strip()
 4923                        error_message_command_all.append(message)
 4924                        if line.startswith("[W::"):
 4925                            error_message_command_warning.append(message)
 4926                        if line.startswith("[E::"):
 4927                            error_message_command_err.append(f"{err_file}: " + message)
 4928            # log info
 4929            for message in list(
 4930                set(error_message_command_err + error_message_command_warning)
 4931            ):
 4932                log.info(f"   {message}")
 4933            # debug info
 4934            for message in list(set(error_message_command_all)):
 4935                log.debug(f"   {message}")
 4936            # failed
 4937            if len(error_message_command_err):
 4938                log.error("Annotation failed: Error in commands")
 4939                raise ValueError("Annotation failed: Error in commands")
 4940
 4941            # Find annotation in header
 4942            with open(tmp_annotate_vcf_name, "rt") as f:
 4943                header_list = self.read_vcf_header(f)
 4944            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 4945
 4946            for ann in annovar_vcf_header.infos:
 4947                if ann not in self.get_header().infos:
 4948                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 4949
 4950            # Update variants
 4951            log.info(f"Annotation - Updating...")
 4952            self.update_from_vcf(tmp_annotate_vcf_name)
 4953
 4954        else:
 4955            if "ANN" in self.get_header().infos:
 4956                log.debug(f"Existing snpEff annotations in VCF")
 4957            if force_update_annotation:
 4958                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 4959
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the current variants to a temporary bgzipped VCF, then for each
        configured Annovar annotation database runs `table_annovar.pl` and
        post-processes its output through a shell pipeline (bcftools view,
        sed clean-ups, awk removal of "." INFO values, bcftools annotate with
        field renaming, tabix indexing). All per-database annotated VCFs are
        merged with `bcftools merge`, new INFO fields found in the merged
        header are added to the in-memory VCF header, and the variants table
        is updated via `update_from_vcf`. Temporary files are removed at the
        end.

        :param threads: number of threads to use; defaults to
            ``self.get_threads()`` when falsy
        :raises ValueError: if the annovar or bcftools binary cannot be
            resolved, or if an annotation command writes error lines
            (``[E::`` / ``ERROR``) to its stderr capture file
        :return: None (returns early without annotating if the variants
            table is empty)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, accumulated for the final cleanup step
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but never used below — the
        # cleanup section at the end runs unconditionally (guarded by
        # "if True:"), so tmp files are removed even in debug mode. Confirm
        # whether debug runs were meant to keep them.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl run through perl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl flags; "genebase" is special-cased below)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: database name -> field mapping or None)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar per-assembly database subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header (mutated below when new INFO fields are discovered)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded: re-annotate even if a field already exists in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is never populated or used below
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." here, while the snpEff path passes
            # remove_info=True — presumably "." replaces INFO content rather
            # than dropping it; confirm against export_variant_vcf
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads any missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run (plus post-processing pipe) per database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset on every iteration, so the
                # merge step after this loop appends to (and cleanup sees) only
                # the last database's error-file list
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # table_annovar.pl writes <outfile>.<assembly>_multianno.vcf
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields requested for this database (and their renamed forms)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("old new" pair appended to the rename file)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: f = filter-based, g = gene-based, r = region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (any extra user options except "genebase" are forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation): "\x3b" escapes -> commas
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # (bcftools -x with "^INFO/x" keeps field x and drops the rest)
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan the captured stderr for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the whole annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                # NOTE(review): merge stderr goes to the .err file but is not
                # re-scanned for errors afterwards (unlike the per-database
                # commands above) — confirm whether that is intended
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file; add any INFO
                # fields not yet present to the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command (runs unconditionally; see delete_tmp note above)
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5340
 5341    # Parquet
 5342    def annotation_parquet(self, threads: int = None) -> None:
 5343        """
 5344        It takes a VCF file, and annotates it with a parquet file
 5345
 5346        :param threads: number of threads to use for the annotation
 5347        :return: the value of the variable "result".
 5348        """
 5349
 5350        # DEBUG
 5351        log.debug("Start annotation with parquet databases")
 5352
 5353        # Threads
 5354        if not threads:
 5355            threads = self.get_threads()
 5356        log.debug("Threads: " + str(threads))
 5357
 5358        # DEBUG
 5359        delete_tmp = True
 5360        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5361            delete_tmp = False
 5362            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5363
 5364        # Config
 5365        databases_folders = set(
 5366            self.get_config()
 5367            .get("folders", {})
 5368            .get("databases", {})
 5369            .get("annotations", ["."])
 5370            + self.get_config()
 5371            .get("folders", {})
 5372            .get("databases", {})
 5373            .get("parquet", ["."])
 5374        )
 5375        log.debug("Databases annotations: " + str(databases_folders))
 5376
 5377        # Param
 5378        annotations = (
 5379            self.get_param()
 5380            .get("annotation", {})
 5381            .get("parquet", {})
 5382            .get("annotations", None)
 5383        )
 5384        log.debug("Annotations: " + str(annotations))
 5385
 5386        # Assembly
 5387        assembly = self.get_param().get(
 5388            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5389        )
 5390
 5391        # Force Update Annotation
 5392        force_update_annotation = (
 5393            self.get_param()
 5394            .get("annotation", {})
 5395            .get("options", {})
 5396            .get("annotations_update", False)
 5397        )
 5398        log.debug(f"force_update_annotation={force_update_annotation}")
 5399        force_append_annotation = (
 5400            self.get_param()
 5401            .get("annotation", {})
 5402            .get("options", {})
 5403            .get("annotations_append", False)
 5404        )
 5405        log.debug(f"force_append_annotation={force_append_annotation}")
 5406
 5407        # Data
 5408        table_variants = self.get_table_variants()
 5409
 5410        # Check if not empty
 5411        log.debug("Check if not empty")
 5412        sql_query_chromosomes_df = self.get_query_to_df(
 5413            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5414        )
 5415        if not sql_query_chromosomes_df["count"][0]:
 5416            log.info(f"VCF empty")
 5417            return
 5418
 5419        # VCF header
 5420        vcf_reader = self.get_header()
 5421        log.debug("Initial header: " + str(vcf_reader.infos))
 5422
 5423        # Nb Variants POS
 5424        log.debug("NB Variants Start")
 5425        nb_variants = self.conn.execute(
 5426            f"SELECT count(*) AS count FROM variants"
 5427        ).fetchdf()["count"][0]
 5428        log.debug("NB Variants Stop")
 5429
 5430        # Existing annotations
 5431        for vcf_annotation in self.get_header().infos:
 5432
 5433            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5434            log.debug(
 5435                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5436            )
 5437
 5438        # Added columns
 5439        added_columns = []
 5440
 5441        # drop indexes
 5442        log.debug(f"Drop indexes...")
 5443        self.drop_indexes()
 5444
 5445        if annotations:
 5446
 5447            if "ALL" in annotations:
 5448
 5449                all_param = annotations.get("ALL", {})
 5450                all_param_formats = all_param.get("formats", None)
 5451                all_param_releases = all_param.get("releases", None)
 5452
 5453                databases_infos_dict = self.scan_databases(
 5454                    database_formats=all_param_formats,
 5455                    database_releases=all_param_releases,
 5456                )
 5457                for database_infos in databases_infos_dict.keys():
 5458                    if database_infos not in annotations:
 5459                        annotations[database_infos] = {"INFO": None}
 5460
 5461            for annotation in annotations:
 5462
 5463                if annotation in ["ALL"]:
 5464                    continue
 5465
 5466                # Annotation Name
 5467                annotation_name = os.path.basename(annotation)
 5468
 5469                # Annotation fields
 5470                annotation_fields = annotations[annotation]
 5471                if not annotation_fields:
 5472                    annotation_fields = {"INFO": None}
 5473
 5474                log.debug(f"Annotation '{annotation_name}'")
 5475                log.debug(
 5476                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5477                )
 5478
 5479                # Create Database
 5480                database = Database(
 5481                    database=annotation,
 5482                    databases_folders=databases_folders,
 5483                    assembly=assembly,
 5484                )
 5485
 5486                # Find files
 5487                parquet_file = database.get_database()
 5488                parquet_hdr_file = database.get_header_file()
 5489                parquet_type = database.get_type()
 5490
 5491                # Check if files exists
 5492                if not parquet_file or not parquet_hdr_file:
 5493                    log.error("Annotation failed: file not found")
 5494                    raise ValueError("Annotation failed: file not found")
 5495                else:
 5496                    # Get parquet connexion
 5497                    parquet_sql_attach = database.get_sql_database_attach(
 5498                        output="query"
 5499                    )
 5500                    if parquet_sql_attach:
 5501                        self.conn.execute(parquet_sql_attach)
 5502                    parquet_file_link = database.get_sql_database_link()
 5503                    # Log
 5504                    log.debug(
 5505                        f"Annotation '{annotation_name}' - file: "
 5506                        + str(parquet_file)
 5507                        + " and "
 5508                        + str(parquet_hdr_file)
 5509                    )
 5510
 5511                    # Database full header columns
 5512                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5513                        parquet_hdr_file
 5514                    )
 5515                    # Log
 5516                    log.debug(
 5517                        "Annotation database header columns : "
 5518                        + str(parquet_hdr_vcf_header_columns)
 5519                    )
 5520
 5521                    # Load header as VCF object
 5522                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5523                    # Log
 5524                    log.debug(
 5525                        "Annotation database header: "
 5526                        + str(parquet_hdr_vcf_header_infos)
 5527                    )
 5528
 5529                    # Get extra infos
 5530                    parquet_columns = database.get_extra_columns()
 5531                    # Log
 5532                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5533
 5534                    # Add extra columns if "ALL" in annotation_fields
 5535                    # if "ALL" in annotation_fields:
 5536                    #     allow_add_extra_column = True
 5537                    if "ALL" in annotation_fields and database.get_extra_columns():
 5538                        for extra_column in database.get_extra_columns():
 5539                            if (
 5540                                extra_column not in annotation_fields
 5541                                and extra_column.replace("INFO/", "")
 5542                                not in parquet_hdr_vcf_header_infos
 5543                            ):
 5544                                parquet_hdr_vcf_header_infos[extra_column] = (
 5545                                    vcf.parser._Info(
 5546                                        extra_column,
 5547                                        ".",
 5548                                        "String",
 5549                                        f"{extra_column} description",
 5550                                        "unknown",
 5551                                        "unknown",
 5552                                        self.code_type_map["String"],
 5553                                    )
 5554                                )
 5555
 5556                    # For all fields in database
 5557                    annotation_fields_all = False
 5558                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5559                        annotation_fields_all = True
 5560                        annotation_fields = {
 5561                            key: key for key in parquet_hdr_vcf_header_infos
 5562                        }
 5563
 5564                        log.debug(
 5565                            "Annotation database header - All annotations added: "
 5566                            + str(annotation_fields)
 5567                        )
 5568
 5569                    # Init
 5570
 5571                    # List of annotation fields to use
 5572                    sql_query_annotation_update_info_sets = []
 5573
 5574                    # List of annotation to agregate
 5575                    sql_query_annotation_to_agregate = []
 5576
 5577                    # Number of fields
 5578                    nb_annotation_field = 0
 5579
 5580                    # Annotation fields processed
 5581                    annotation_fields_processed = []
 5582
 5583                    # Columns mapping
 5584                    map_columns = database.map_columns(
 5585                        columns=annotation_fields, prefixes=["INFO/"]
 5586                    )
 5587
 5588                    # Query dict for fields to remove (update option)
 5589                    query_dict_remove = {}
 5590
                    # Fetch annotation fields
 5592                    for annotation_field in annotation_fields:
 5593
 5594                        # annotation_field_column
 5595                        annotation_field_column = map_columns.get(
 5596                            annotation_field, "INFO"
 5597                        )
 5598
 5599                        # field new name, if parametered
 5600                        annotation_fields_new_name = annotation_fields.get(
 5601                            annotation_field, annotation_field
 5602                        )
 5603                        if not annotation_fields_new_name:
 5604                            annotation_fields_new_name = annotation_field
 5605
 5606                        # To annotate
 5607                        # force_update_annotation = True
 5608                        # force_append_annotation = True
 5609                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5610                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5611                            force_update_annotation
 5612                            or force_append_annotation
 5613                            or (
 5614                                annotation_fields_new_name
 5615                                not in self.get_header().infos
 5616                            )
 5617                        ):
 5618
 5619                            # Add field to annotation to process list
 5620                            annotation_fields_processed.append(
 5621                                annotation_fields_new_name
 5622                            )
 5623
 5624                            # explode infos for the field
 5625                            annotation_fields_new_name_info_msg = ""
 5626                            if (
 5627                                force_update_annotation
 5628                                and annotation_fields_new_name
 5629                                in self.get_header().infos
 5630                            ):
 5631                                # Remove field from INFO
 5632                                query = f"""
 5633                                    UPDATE {table_variants} as table_variants
 5634                                    SET INFO = REGEXP_REPLACE(
 5635                                                concat(table_variants.INFO,''),
 5636                                                ';*{annotation_fields_new_name}=[^;]*',
 5637                                                ''
 5638                                                )
 5639                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5640                                """
 5641                                annotation_fields_new_name_info_msg = " [update]"
 5642                                query_dict_remove[
 5643                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5644                                ] = query
 5645
 5646                            # Sep between fields in INFO
 5647                            nb_annotation_field += 1
 5648                            if nb_annotation_field > 1:
 5649                                annotation_field_sep = ";"
 5650                            else:
 5651                                annotation_field_sep = ""
 5652
 5653                            log.info(
 5654                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5655                            )
 5656
 5657                            # Add INFO field to header
 5658                            parquet_hdr_vcf_header_infos_number = (
 5659                                parquet_hdr_vcf_header_infos[annotation_field].num
 5660                                or "."
 5661                            )
 5662                            parquet_hdr_vcf_header_infos_type = (
 5663                                parquet_hdr_vcf_header_infos[annotation_field].type
 5664                                or "String"
 5665                            )
 5666                            parquet_hdr_vcf_header_infos_description = (
 5667                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5668                                or f"{annotation_field} description"
 5669                            )
 5670                            parquet_hdr_vcf_header_infos_source = (
 5671                                parquet_hdr_vcf_header_infos[annotation_field].source
 5672                                or "unknown"
 5673                            )
 5674                            parquet_hdr_vcf_header_infos_version = (
 5675                                parquet_hdr_vcf_header_infos[annotation_field].version
 5676                                or "unknown"
 5677                            )
 5678
 5679                            vcf_reader.infos[annotation_fields_new_name] = (
 5680                                vcf.parser._Info(
 5681                                    annotation_fields_new_name,
 5682                                    parquet_hdr_vcf_header_infos_number,
 5683                                    parquet_hdr_vcf_header_infos_type,
 5684                                    parquet_hdr_vcf_header_infos_description,
 5685                                    parquet_hdr_vcf_header_infos_source,
 5686                                    parquet_hdr_vcf_header_infos_version,
 5687                                    self.code_type_map[
 5688                                        parquet_hdr_vcf_header_infos_type
 5689                                    ],
 5690                                )
 5691                            )
 5692
 5693                            # Append
 5694                            if force_append_annotation:
 5695                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5696                            else:
 5697                                query_case_when_append = ""
 5698
 5699                            # Annotation/Update query fields
 5700                            # Found in INFO column
 5701                            if (
 5702                                annotation_field_column == "INFO"
 5703                                and "INFO" in parquet_hdr_vcf_header_columns
 5704                            ):
 5705                                sql_query_annotation_update_info_sets.append(
 5706                                    f"""
 5707                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5708                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5709                                        ELSE ''
 5710                                    END
 5711                                """
 5712                                )
 5713                            # Found in a specific column
 5714                            else:
 5715                                sql_query_annotation_update_info_sets.append(
 5716                                    f"""
 5717                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5718                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
 5719                                        ELSE ''
 5720                                    END
 5721                                """
 5722                                )
 5723                                sql_query_annotation_to_agregate.append(
 5724                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5725                                )
 5726
 5727                        # Not to annotate
 5728                        else:
 5729
 5730                            if force_update_annotation:
 5731                                annotation_message = "forced"
 5732                            else:
 5733                                annotation_message = "skipped"
 5734
 5735                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5736                                log.warning(
 5737                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5738                                )
 5739                            if annotation_fields_new_name in self.get_header().infos:
 5740                                log.warning(
 5741                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5742                                )
 5743
 5744                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5745                    # allow_annotation_full_info = True
 5746                    allow_annotation_full_info = not force_append_annotation
 5747
 5748                    if parquet_type in ["regions"]:
 5749                        allow_annotation_full_info = False
 5750
 5751                    if (
 5752                        allow_annotation_full_info
 5753                        and nb_annotation_field == len(annotation_fields)
 5754                        and annotation_fields_all
 5755                        and (
 5756                            "INFO" in parquet_hdr_vcf_header_columns
 5757                            and "INFO" in database.get_extra_columns()
 5758                        )
 5759                    ):
 5760                        log.debug("Column INFO annotation enabled")
 5761                        sql_query_annotation_update_info_sets = []
 5762                        sql_query_annotation_update_info_sets.append(
 5763                            f" table_parquet.INFO "
 5764                        )
 5765
 5766                    if sql_query_annotation_update_info_sets:
 5767
 5768                        # Annotate
 5769                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5770
 5771                        # Join query annotation update info sets for SQL
 5772                        sql_query_annotation_update_info_sets_sql = ",".join(
 5773                            sql_query_annotation_update_info_sets
 5774                        )
 5775
 5776                        # Check chromosomes list (and variants infos)
 5777                        sql_query_chromosomes = f"""
 5778                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5779                            FROM {table_variants} as table_variants
 5780                            GROUP BY table_variants."#CHROM"
 5781                            ORDER BY table_variants."#CHROM"
 5782                            """
 5783                        sql_query_chromosomes_df = self.conn.execute(
 5784                            sql_query_chromosomes
 5785                        ).df()
 5786                        sql_query_chromosomes_dict = {
 5787                            entry["CHROM"]: {
 5788                                "count": entry["count_variants"],
 5789                                "min": entry["min_variants"],
 5790                                "max": entry["max_variants"],
 5791                            }
 5792                            for index, entry in sql_query_chromosomes_df.iterrows()
 5793                        }
 5794
 5795                        # Init
 5796                        nb_of_query = 0
 5797                        nb_of_variant_annotated = 0
 5798                        query_dict = query_dict_remove
 5799
 5800                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5801                        for chrom in sql_query_chromosomes_dict:
 5802
 5803                            # Number of variant by chromosome
 5804                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5805                                chrom, {}
 5806                            ).get("count", 0)
 5807
 5808                            log.debug(
 5809                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5810                            )
 5811
 5812                            # Annotation with regions database
 5813                            if parquet_type in ["regions"]:
 5814                                sql_query_annotation_from_clause = f"""
 5815                                    FROM (
 5816                                        SELECT 
 5817                                            '{chrom}' AS \"#CHROM\",
 5818                                            table_variants_from.\"POS\" AS \"POS\",
 5819                                            {",".join(sql_query_annotation_to_agregate)}
 5820                                        FROM {table_variants} as table_variants_from
 5821                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5822                                            table_parquet_from."#CHROM" = '{chrom}'
 5823                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5824                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5825                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5826                                                )
 5827                                        )
 5828                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5829                                        GROUP BY table_variants_from.\"POS\"
 5830                                        )
 5831                                        as table_parquet
 5832                                """
 5833
 5834                                sql_query_annotation_where_clause = """
 5835                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5836                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5837                                """
 5838
 5839                            # Annotation with variants database
 5840                            else:
 5841                                sql_query_annotation_from_clause = f"""
 5842                                    FROM {parquet_file_link} as table_parquet
 5843                                """
 5844                                sql_query_annotation_where_clause = f"""
 5845                                    table_variants."#CHROM" = '{chrom}'
 5846                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5847                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5848                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5849                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5850                                """
 5851
 5852                            # Create update query
 5853                            sql_query_annotation_chrom_interval_pos = f"""
 5854                                UPDATE {table_variants} as table_variants
 5855                                    SET INFO = 
 5856                                        concat(
 5857                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5858                                                THEN table_variants.INFO
 5859                                                ELSE ''
 5860                                            END
 5861                                            ,
 5862                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5863                                                        AND (
 5864                                                        concat({sql_query_annotation_update_info_sets_sql})
 5865                                                        )
 5866                                                        NOT IN ('','.') 
 5867                                                    THEN ';'
 5868                                                    ELSE ''
 5869                                            END
 5870                                            ,
 5871                                            {sql_query_annotation_update_info_sets_sql}
 5872                                            )
 5873                                    {sql_query_annotation_from_clause}
 5874                                    WHERE {sql_query_annotation_where_clause}
 5875                                    ;
 5876                                """
 5877
 5878                            # Add update query to dict
 5879                            query_dict[
 5880                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 5881                            ] = sql_query_annotation_chrom_interval_pos
 5882
 5883                        nb_of_query = len(query_dict)
 5884                        num_query = 0
 5885
 5886                        # SET max_expression_depth TO x
 5887                        self.conn.execute("SET max_expression_depth TO 10000")
 5888
 5889                        for query_name in query_dict:
 5890                            query = query_dict[query_name]
 5891                            num_query += 1
 5892                            log.info(
 5893                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 5894                            )
 5895                            result = self.conn.execute(query)
 5896                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 5897                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 5898                            log.info(
 5899                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 5900                            )
 5901
 5902                        log.info(
 5903                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 5904                        )
 5905
 5906                    else:
 5907
 5908                        log.info(
 5909                            f"Annotation '{annotation_name}' - No Annotations available"
 5910                        )
 5911
 5912                    log.debug("Final header: " + str(vcf_reader.infos))
 5913
 5914        # Remove added columns
 5915        for added_column in added_columns:
 5916            self.drop_column(column=added_column)
 5917
 5918    def annotation_splice(self, threads: int = None) -> None:
 5919        """
 5920        This function annotate with snpEff
 5921
 5922        :param threads: The number of threads to use
 5923        :return: the value of the variable "return_value".
 5924        """
 5925
 5926        # DEBUG
 5927        log.debug("Start annotation with splice tools")
 5928
 5929        # Threads
 5930        if not threads:
 5931            threads = self.get_threads()
 5932        log.debug("Threads: " + str(threads))
 5933
 5934        # DEBUG
 5935        delete_tmp = True
 5936        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5937            delete_tmp = False
 5938            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5939
 5940        # Config
 5941        config = self.get_config()
 5942        log.debug("Config: " + str(config))
 5943        splice_config = config.get("tools", {}).get("splice", {})
 5944        if not splice_config:
 5945            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 5946        if not splice_config:
 5947            msg_err = "No Splice tool config"
 5948            log.error(msg_err)
 5949            raise ValueError(msg_err)
 5950        log.debug(f"splice_config={splice_config}")
 5951
 5952        # Config - Folders - Databases
 5953        databases_folders = (
 5954            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 5955        )
 5956        log.debug("Databases annotations: " + str(databases_folders))
 5957
 5958        # Splice docker image
 5959        splice_docker_image = splice_config.get("docker").get("image")
 5960
 5961        # Pull splice image if it's not already there
 5962        if not check_docker_image_exists(splice_docker_image):
 5963            log.warning(
 5964                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 5965            )
 5966            try:
 5967                command(f"docker pull {splice_config.get('docker').get('image')}")
 5968            except subprocess.CalledProcessError:
 5969                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 5970                log.error(msg_err)
 5971                raise ValueError(msg_err)
 5972                return None
 5973
 5974        # Config - splice databases
 5975        splice_databases = (
 5976            config.get("folders", {})
 5977            .get("databases", {})
 5978            .get("splice", DEFAULT_SPLICE_FOLDER)
 5979        )
 5980        splice_databases = full_path(splice_databases)
 5981
 5982        # Param
 5983        param = self.get_param()
 5984        log.debug("Param: " + str(param))
 5985
 5986        # Param
 5987        options = param.get("annotation", {}).get("splice", {})
 5988        log.debug("Options: " + str(options))
 5989
 5990        # Data
 5991        table_variants = self.get_table_variants()
 5992
 5993        # Check if not empty
 5994        log.debug("Check if not empty")
 5995        sql_query_chromosomes = (
 5996            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5997        )
 5998        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 5999            log.info("VCF empty")
 6000            return None
 6001
 6002        # Export in VCF
 6003        log.debug("Create initial file to annotate")
 6004
 6005        # Create output folder
 6006        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6007        if not os.path.exists(output_folder):
 6008            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6009
 6010        # Create tmp VCF file
 6011        tmp_vcf = NamedTemporaryFile(
 6012            prefix=self.get_prefix(),
 6013            dir=output_folder,
 6014            suffix=".vcf",
 6015            delete=False,
 6016        )
 6017        tmp_vcf_name = tmp_vcf.name
 6018
 6019        # VCF header
 6020        header = self.get_header()
 6021
 6022        # Existing annotations
 6023        for vcf_annotation in self.get_header().infos:
 6024
 6025            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6026            log.debug(
 6027                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6028            )
 6029
 6030        # Memory limit
 6031        if config.get("memory", None):
 6032            memory_limit = config.get("memory", "8G").upper()
 6033            # upper()
 6034        else:
 6035            memory_limit = "8G"
 6036        log.debug(f"memory_limit: {memory_limit}")
 6037
 6038        # Check number of variants to annotate
 6039        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6040        where_clause_regex_spip = r"SPiP_\w+"
 6041        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6042        df_list_of_variants_to_annotate = self.get_query_to_df(
 6043            query=f""" SELECT * FROM variants {where_clause} """
 6044        )
 6045        if len(df_list_of_variants_to_annotate) == 0:
 6046            log.warning(
 6047                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6048            )
 6049            return None
 6050        else:
 6051            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6052
 6053        # Export VCF file
 6054        self.export_variant_vcf(
 6055            vcf_file=tmp_vcf_name,
 6056            remove_info=True,
 6057            add_samples=True,
 6058            index=False,
 6059            where_clause=where_clause,
 6060        )
 6061
 6062        # Create docker container and launch splice analysis
 6063        if splice_config:
 6064
 6065            # Splice mount folders
 6066            mount_folders = splice_config.get("mount", {})
 6067
 6068            # Genome mount
 6069            mount_folders[
 6070                config.get("folders", {})
 6071                .get("databases", {})
 6072                .get("genomes", DEFAULT_GENOME_FOLDER)
 6073            ] = "ro"
 6074
 6075            # SpliceAI mount
 6076            mount_folders[
 6077                config.get("folders", {})
 6078                .get("databases", {})
 6079                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6080            ] = "ro"
 6081
 6082            # Genome mount
 6083            mount_folders[
 6084                config.get("folders", {})
 6085                .get("databases", {})
 6086                .get("spip", DEFAULT_SPIP_FOLDER)
 6087            ] = "ro"
 6088
 6089            # Mount folders
 6090            mount = []
 6091
 6092            # Config mount
 6093            mount = [
 6094                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6095                for path, mode in mount_folders.items()
 6096            ]
 6097
 6098            if any(value for value in splice_config.values() if value is None):
 6099                log.warning("At least one splice config parameter is empty")
 6100                return None
 6101
 6102            # Params in splice nf
 6103            def check_values(dico: dict):
 6104                """
 6105                Ensure parameters for NF splice pipeline
 6106                """
 6107                for key, val in dico.items():
 6108                    if key == "genome":
 6109                        if any(
 6110                            assemb in options.get("genome", {})
 6111                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6112                        ):
 6113                            yield f"--{key} hg19"
 6114                        elif any(
 6115                            assemb in options.get("genome", {})
 6116                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6117                        ):
 6118                            yield f"--{key} hg38"
 6119                    elif (
 6120                        (isinstance(val, str) and val)
 6121                        or isinstance(val, int)
 6122                        or isinstance(val, bool)
 6123                    ):
 6124                        yield f"--{key} {val}"
 6125
 6126            # Genome
 6127            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6128            options["genome"] = genome
 6129
 6130            # NF params
 6131            nf_params = []
 6132
 6133            # Add options
 6134            if options:
 6135                nf_params = list(check_values(options))
 6136                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6137            else:
 6138                log.debug("No NF params provided")
 6139
 6140            # Add threads
 6141            if "threads" not in options.keys():
 6142                nf_params.append(f"--threads {threads}")
 6143
 6144            # Genome path
 6145            genome_path = find_genome(
 6146                config.get("folders", {})
 6147                .get("databases", {})
 6148                .get("genomes", DEFAULT_GENOME_FOLDER),
 6149                file=f"{genome}.fa",
 6150            )
 6151            # Add genome path
 6152            if not genome_path:
 6153                raise ValueError(
 6154                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6155                )
 6156            else:
 6157                log.debug(f"Genome: {genome_path}")
 6158                nf_params.append(f"--genome_path {genome_path}")
 6159
 6160            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6161                """
 6162                Setting up updated databases for SPiP and SpliceAI
 6163                """
 6164
 6165                try:
 6166
 6167                    # SpliceAI assembly transcriptome
 6168                    spliceai_assembly = os.path.join(
 6169                        config.get("folders", {})
 6170                        .get("databases", {})
 6171                        .get("spliceai", {}),
 6172                        options.get("genome"),
 6173                        "transcriptome",
 6174                    )
 6175                    spip_assembly = options.get("genome")
 6176
 6177                    spip = find(
 6178                        f"transcriptome_{spip_assembly}.RData",
 6179                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6180                    )
 6181                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6182                    log.debug(f"SPiP annotations: {spip}")
 6183                    log.debug(f"SpliceAI annotations: {spliceai}")
 6184                    if spip and spliceai:
 6185                        return [
 6186                            f"--spip_transcriptome {spip}",
 6187                            f"--spliceai_annotations {spliceai}",
 6188                        ]
 6189                    else:
 6190                        # TODO crash and go on with basic annotations ?
 6191                        # raise ValueError(
 6192                        #     "Can't find splice databases in configuration EXIT"
 6193                        # )
 6194                        log.warning(
 6195                            "Can't find splice databases in configuration, use annotations file from image"
 6196                        )
 6197                except TypeError:
 6198                    log.warning(
 6199                        "Can't find splice databases in configuration, use annotations file from image"
 6200                    )
 6201                    return []
 6202
 6203            # Add options, check if transcriptome option have already beend provided
 6204            if (
 6205                "spip_transcriptome" not in nf_params
 6206                and "spliceai_transcriptome" not in nf_params
 6207            ):
 6208                splice_reference = splice_annotations(options, config)
 6209                if splice_reference:
 6210                    nf_params.extend(splice_reference)
 6211
 6212            nf_params.append(f"--output_folder {output_folder}")
 6213
 6214            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6215            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6216            log.debug(cmd)
 6217
 6218            splice_config["docker"]["command"] = cmd
 6219
 6220            docker_cmd = get_bin_command(
 6221                tool="splice",
 6222                bin_type="docker",
 6223                config=config,
 6224                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6225                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6226            )
 6227
 6228            # Docker debug
 6229            # if splice_config.get("rm_container"):
 6230            #     rm_container = "--rm"
 6231            # else:
 6232            #     rm_container = ""
 6233            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6234
 6235            log.debug(docker_cmd)
 6236            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6237            log.debug(res.stdout)
 6238            if res.stderr:
 6239                log.error(res.stderr)
 6240            res.check_returncode()
 6241        else:
 6242            log.warning(f"Splice tool configuration not found: {config}")
 6243
 6244        # Update variants
 6245        log.info("Annotation - Updating...")
 6246        # Test find output vcf
 6247        log.debug(
 6248            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6249        )
 6250        output_vcf = []
 6251        # Wrong folder to look in
 6252        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6253            if (
 6254                files
 6255                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6256            ):
 6257                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6258        # log.debug(os.listdir(options.get("output_folder")))
 6259        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6260        if not output_vcf:
 6261            log.debug(
 6262                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6263            )
 6264        else:
 6265            # Get new header from annotated vcf
 6266            log.debug(f"Initial header: {len(header.infos)} fields")
 6267            # Create new header with splice infos
 6268            new_vcf = Variants(input=output_vcf[0])
 6269            new_vcf_header = new_vcf.get_header().infos
 6270            for keys, infos in new_vcf_header.items():
 6271                if keys not in header.infos.keys():
 6272                    header.infos[keys] = infos
 6273            log.debug(f"New header: {len(header.infos)} fields")
 6274            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6275            self.update_from_vcf(output_vcf[0])
 6276
 6277        # Remove folder
 6278        remove_if_exists(output_folder)
 6279
 6280    ###
 6281    # Prioritization
 6282    ###
 6283
    def get_config_default(self, name: str) -> dict:
        """
        Return the built-in default configuration section identified by `name`.

        The defaults cover two sections:
        - "calculations": operations adding computed INFO fields/columns, either
          SQL-based (with an `operation_query` executed by the database engine)
          or Python-based (with a `function_name`/`function_params` dispatch).
        - "prioritizations": scoring/flagging criteria per INFO field for the
          "default" prioritization profile.

        :param name: key of the configuration section to retrieve
            (e.g. "calculations" or "prioritizations")
        :type name: str
        :return: the default configuration dictionary for `name`, or `None`
            if `name` does not match any known section
        """

        config_default = {
            # Calculation operations: each entry describes either a SQL
            # operation (executed against the variants table) or a Python
            # function dispatched by name with positional params
            "calculations": {
                "variant_chr_pos_alt_ref": {
                    "type": "sql",
                    "name": "variant_chr_pos_alt_ref",
                    "description": "Create a variant ID with chromosome, position, alt and ref",
                    "available": False,
                    "output_column_name": "variant_chr_pos_alt_ref",
                    "output_column_type": "String",
                    "output_column_description": "variant ID with chromosome, position, alt and ref",
                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                    "operation_info": True,
                },
                "VARTYPE": {
                    "type": "sql",
                    "name": "VARTYPE",
                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                    "available": True,
                    "output_column_name": "VARTYPE",
                    "output_column_type": "String",
                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                    # NOTE(review): '"SVTYPE" NOT NULL' relies on the SQL engine
                    # accepting the postfix NOT NULL form (standard SQL is
                    # 'IS NOT NULL') — presumably supported by the engine used
                    # here; confirm before porting this query elsewhere
                    "operation_query": """
                            CASE
                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                                ELSE 'UNDEFINED'
                            END
                            """,
                    "info_fields": ["SVTYPE"],
                    "operation_info": True,
                },
                "snpeff_hgvs": {
                    "type": "python",
                    "name": "snpeff_hgvs",
                    "description": "HGVS nomenclatures from snpEff annotation",
                    "available": True,
                    "function_name": "calculation_extract_snpeff_hgvs",
                    "function_params": ["snpeff_hgvs", "ANN"],
                },
                "snpeff_ann_explode": {
                    "type": "python",
                    "name": "snpeff_ann_explode",
                    "description": "Explode snpEff annotations with uniquify values",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "fields", "snpeff_", "ANN"],
                },
                "snpeff_ann_explode_uniquify": {
                    "type": "python",
                    "name": "snpeff_ann_explode_uniquify",
                    "description": "Explode snpEff annotations",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
                },
                "snpeff_ann_explode_json": {
                    "type": "python",
                    "name": "snpeff_ann_explode_json",
                    "description": "Explode snpEff annotations in JSON format",
                    "available": True,
                    "function_name": "calculation_snpeff_ann_explode",
                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
                },
                "NOMEN": {
                    "type": "python",
                    "name": "NOMEN",
                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                    "available": True,
                    "function_name": "calculation_extract_nomen",
                    "function_params": [],
                },
                "FINDBYPIPELINE": {
                    "type": "python",
                    "name": "FINDBYPIPELINE",
                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbypipeline"],
                },
                "FINDBYSAMPLE": {
                    "type": "python",
                    "name": "FINDBYSAMPLE",
                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                    "available": True,
                    "function_name": "calculation_find_by_pipeline",
                    "function_params": ["findbysample"],
                },
                "GENOTYPECONCORDANCE": {
                    "type": "python",
                    "name": "GENOTYPECONCORDANCE",
                    "description": "Concordance of genotype for multi caller VCF",
                    "available": True,
                    "function_name": "calculation_genotype_concordance",
                    "function_params": [],
                },
                "BARCODE": {
                    "type": "python",
                    "name": "BARCODE",
                    "description": "BARCODE as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode",
                    "function_params": [],
                },
                "BARCODEFAMILY": {
                    "type": "python",
                    "name": "BARCODEFAMILY",
                    "description": "BARCODEFAMILY as VaRank tool",
                    "available": True,
                    "function_name": "calculation_barcode_family",
                    "function_params": ["BCF"],
                },
                "TRIO": {
                    "type": "python",
                    "name": "TRIO",
                    "description": "Inheritance for a trio family",
                    "available": True,
                    "function_name": "calculation_trio",
                    "function_params": [],
                },
                "VAF": {
                    "type": "python",
                    "name": "VAF",
                    "description": "Variant Allele Frequency (VAF) harmonization",
                    "available": True,
                    "function_name": "calculation_vaf_normalization",
                    "function_params": [],
                },
                "VAF_stats": {
                    "type": "python",
                    "name": "VAF_stats",
                    "description": "Variant Allele Frequency (VAF) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["VAF"],
                },
                "DP_stats": {
                    "type": "python",
                    "name": "DP_stats",
                    "description": "Depth (DP) statistics",
                    "available": True,
                    "function_name": "calculation_genotype_stats",
                    "function_params": ["DP"],
                },
                "variant_id": {
                    "type": "python",
                    "name": "variant_id",
                    "description": "Variant ID generated from variant position and type",
                    "available": True,
                    "function_name": "calculation_variant_id",
                    "function_params": [],
                },
                "transcripts_json": {
                    "type": "python",
                    "name": "transcripts_json",
                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": ["transcripts_json", None],
                },
                "transcripts_ann": {
                    "type": "python",
                    "name": "transcripts_ann",
                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, "transcripts_ann"],
                },
                "transcripts_annotations": {
                    "type": "python",
                    "name": "transcripts_annotations",
                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                    "available": True,
                    "function_name": "calculation_transcripts_annotation",
                    "function_params": [None, None],
                },
                "transcripts_prioritization": {
                    "type": "python",
                    "name": "transcripts_prioritization",
                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                    "available": True,
                    "function_name": "calculation_transcripts_prioritization",
                    "function_params": [],
                },
            },
            # Prioritization profiles: per INFO field, a list of criteria, each
            # with a comparison type/value and the resulting score/flag/comment
            "prioritizations": {
                "default": {
                    "filter": [
                        {
                            "type": "notequals",
                            "value": "!PASS|\\.",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": ["Bad variant quality"],
                        },
                        {
                            "type": "equals",
                            "value": "REJECT",
                            "score": -20,
                            "flag": "PASS",
                            "comment": ["Bad variant quality"],
                        },
                    ],
                    "DP": [
                        {
                            "type": "gte",
                            "value": "50",
                            "score": 5,
                            "flag": "PASS",
                            "comment": ["DP higher than 50"],
                        }
                    ],
                    "ANN": [
                        {
                            "type": "contains",
                            "value": "HIGH",
                            "score": 5,
                            "flag": "PASS",
                            "comment": [
                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODERATE",
                            "score": 3,
                            "flag": "PASS",
                            "comment": [
                                "A non-disruptive variant that might change protein effectiveness"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "LOW",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Assumed to be mostly harmless or unlikely to change protein behavior"
                            ],
                        },
                        {
                            "type": "contains",
                            "value": "MODIFIER",
                            "score": 0,
                            "flag": "FILTERED",
                            "comment": [
                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                            ],
                        },
                    ],
                }
            },
        }

        # Unknown section names yield None (not an empty dict)
        return config_default.get(name, None)
 6557
 6558    def get_config_json(
 6559        self, name: str, config_dict: dict = {}, config_file: str = None
 6560    ) -> dict:
 6561        """
 6562        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6563        default values, a dictionary, and a file.
 6564
 6565        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6566        the name of the configuration. It is used to identify and retrieve the configuration settings
 6567        for a specific component or module
 6568        :type name: str
 6569        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6570        dictionary that allows you to provide additional configuration settings or overrides. When you
 6571        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6572        the key is the configuration setting you want to override or
 6573        :type config_dict: dict
 6574        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6575        specify the path to a configuration file that contains additional settings. If provided, the
 6576        function will read the contents of this file and update the configuration dictionary with the
 6577        values found in the file, overriding any existing values with the
 6578        :type config_file: str
 6579        :return: The function `get_config_json` returns a dictionary containing the configuration
 6580        settings.
 6581        """
 6582
 6583        # Create with default prioritizations
 6584        config_default = self.get_config_default(name=name)
 6585        configuration = config_default
 6586        # log.debug(f"configuration={configuration}")
 6587
 6588        # Replace prioritizations from dict
 6589        for config in config_dict:
 6590            configuration[config] = config_dict[config]
 6591
 6592        # Replace prioritizations from file
 6593        config_file = full_path(config_file)
 6594        if config_file:
 6595            if os.path.exists(config_file):
 6596                with open(config_file) as config_file_content:
 6597                    config_file_dict = json.load(config_file_content)
 6598                for config in config_file_dict:
 6599                    configuration[config] = config_file_dict[config]
 6600            else:
 6601                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6602                log.error(msg_error)
 6603                raise ValueError(msg_error)
 6604
 6605        return configuration
 6606
 6607    def prioritization(
 6608        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6609    ) -> bool:
 6610        """
 6611        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6612        prioritizes variants based on configured profiles and criteria.
 6613
 6614        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6615        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6616        a table name is provided, the method will prioritize the variants in that specific table
 6617        :type table: str
 6618        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6619        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6620        provided, the code will use a default prefix value of "PZ"
 6621        :type pz_prefix: str
 6622        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6623        additional parameters specific to the prioritization process. These parameters can include
 6624        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6625        configurations needed for the prioritization of variants in a V
 6626        :type pz_param: dict
 6627        :return: A boolean value (True) is being returned from the `prioritization` function.
 6628        """
 6629
 6630        # Config
 6631        config = self.get_config()
 6632
 6633        # Param
 6634        param = self.get_param()
 6635
 6636        # Prioritization param
 6637        if pz_param is not None:
 6638            prioritization_param = pz_param
 6639        else:
 6640            prioritization_param = param.get("prioritization", {})
 6641
 6642        # Configuration profiles
 6643        prioritization_config_file = prioritization_param.get(
 6644            "prioritization_config", None
 6645        )
 6646        prioritization_config_file = full_path(prioritization_config_file)
 6647        prioritizations_config = self.get_config_json(
 6648            name="prioritizations", config_file=prioritization_config_file
 6649        )
 6650
 6651        # Prioritization prefix
 6652        pz_prefix_default = "PZ"
 6653        if pz_prefix is None:
 6654            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6655
 6656        # Prioritization options
 6657        profiles = prioritization_param.get("profiles", [])
 6658        if isinstance(profiles, str):
 6659            profiles = profiles.split(",")
 6660        pzfields = prioritization_param.get(
 6661            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6662        )
 6663        if isinstance(pzfields, str):
 6664            pzfields = pzfields.split(",")
 6665        default_profile = prioritization_param.get("default_profile", None)
 6666        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6667        prioritization_score_mode = prioritization_param.get(
 6668            "prioritization_score_mode", "HOWARD"
 6669        )
 6670
 6671        # Quick Prioritizations
 6672        prioritizations = param.get("prioritizations", None)
 6673        if prioritizations:
 6674            log.info("Quick Prioritization:")
 6675            for profile in prioritizations.split(","):
 6676                if profile not in profiles:
 6677                    profiles.append(profile)
 6678                    log.info(f"   {profile}")
 6679
 6680        # If profile "ALL" provided, all profiles in the config profiles
 6681        if "ALL" in profiles:
 6682            profiles = list(prioritizations_config.keys())
 6683
 6684        for profile in profiles:
 6685            if prioritizations_config.get(profile, None):
 6686                log.debug(f"Profile '{profile}' configured")
 6687            else:
 6688                msg_error = f"Profile '{profile}' NOT configured"
 6689                log.error(msg_error)
 6690                raise ValueError(msg_error)
 6691
 6692        if profiles:
 6693            log.info(f"Prioritization... ")
 6694        else:
 6695            log.debug(f"No profile defined")
 6696            return False
 6697
 6698        if not default_profile and len(profiles):
 6699            default_profile = profiles[0]
 6700
 6701        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6702        log.debug("Profiles to check: " + str(list(profiles)))
 6703
 6704        # Variables
 6705        if table is not None:
 6706            table_variants = table
 6707        else:
 6708            table_variants = self.get_table_variants(clause="update")
 6709        log.debug(f"Table to prioritize: {table_variants}")
 6710
 6711        # Added columns
 6712        added_columns = []
 6713
 6714        # Create list of PZfields
 6715        # List of PZFields
 6716        list_of_pzfields_original = pzfields + [
 6717            pzfield + pzfields_sep + profile
 6718            for pzfield in pzfields
 6719            for profile in profiles
 6720        ]
 6721        list_of_pzfields = []
 6722        log.debug(f"{list_of_pzfields_original}")
 6723
 6724        # Remove existing PZfields to use if exists
 6725        for pzfield in list_of_pzfields_original:
 6726            if self.get_header().infos.get(pzfield, None) is None:
 6727                list_of_pzfields.append(pzfield)
 6728                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6729            else:
 6730                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6731
 6732        if list_of_pzfields:
 6733
 6734            # Explode Infos prefix
 6735            explode_infos_prefix = self.get_explode_infos_prefix()
 6736
 6737            # PZfields tags description
 6738            PZfields_INFOS = {
 6739                f"{pz_prefix}Tags": {
 6740                    "ID": f"{pz_prefix}Tags",
 6741                    "Number": ".",
 6742                    "Type": "String",
 6743                    "Description": "Variant tags based on annotation criteria",
 6744                },
 6745                f"{pz_prefix}Score": {
 6746                    "ID": f"{pz_prefix}Score",
 6747                    "Number": 1,
 6748                    "Type": "Integer",
 6749                    "Description": "Variant score based on annotation criteria",
 6750                },
 6751                f"{pz_prefix}Flag": {
 6752                    "ID": f"{pz_prefix}Flag",
 6753                    "Number": 1,
 6754                    "Type": "String",
 6755                    "Description": "Variant flag based on annotation criteria",
 6756                },
 6757                f"{pz_prefix}Comment": {
 6758                    "ID": f"{pz_prefix}Comment",
 6759                    "Number": ".",
 6760                    "Type": "String",
 6761                    "Description": "Variant comment based on annotation criteria",
 6762                },
 6763                f"{pz_prefix}Infos": {
 6764                    "ID": f"{pz_prefix}Infos",
 6765                    "Number": ".",
 6766                    "Type": "String",
 6767                    "Description": "Variant infos based on annotation criteria",
 6768                },
 6769            }
 6770
 6771            # Create INFO fields if not exist
 6772            for field in PZfields_INFOS:
 6773                field_ID = PZfields_INFOS[field]["ID"]
 6774                field_description = PZfields_INFOS[field]["Description"]
 6775                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6776                    field_description = (
 6777                        PZfields_INFOS[field]["Description"]
 6778                        + f", profile {default_profile}"
 6779                    )
 6780                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6781                        field_ID,
 6782                        PZfields_INFOS[field]["Number"],
 6783                        PZfields_INFOS[field]["Type"],
 6784                        field_description,
 6785                        "unknown",
 6786                        "unknown",
 6787                        code_type_map[PZfields_INFOS[field]["Type"]],
 6788                    )
 6789
 6790            # Create INFO fields if not exist for each profile
 6791            for profile in prioritizations_config:
 6792                if profile in profiles or profiles == []:
 6793                    for field in PZfields_INFOS:
 6794                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6795                        field_description = (
 6796                            PZfields_INFOS[field]["Description"]
 6797                            + f", profile {profile}"
 6798                        )
 6799                        if (
 6800                            field_ID not in self.get_header().infos
 6801                            and field in pzfields
 6802                        ):
 6803                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6804                                field_ID,
 6805                                PZfields_INFOS[field]["Number"],
 6806                                PZfields_INFOS[field]["Type"],
 6807                                field_description,
 6808                                "unknown",
 6809                                "unknown",
 6810                                code_type_map[PZfields_INFOS[field]["Type"]],
 6811                            )
 6812
 6813            # Header
 6814            for pzfield in list_of_pzfields:
 6815                if re.match(f"{pz_prefix}Score.*", pzfield):
 6816                    added_column = self.add_column(
 6817                        table_name=table_variants,
 6818                        column_name=pzfield,
 6819                        column_type="INTEGER",
 6820                        default_value="0",
 6821                    )
 6822                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6823                    added_column = self.add_column(
 6824                        table_name=table_variants,
 6825                        column_name=pzfield,
 6826                        column_type="BOOLEAN",
 6827                        default_value="1",
 6828                    )
 6829                else:
 6830                    added_column = self.add_column(
 6831                        table_name=table_variants,
 6832                        column_name=pzfield,
 6833                        column_type="STRING",
 6834                        default_value="''",
 6835                    )
 6836                added_columns.append(added_column)
 6837
 6838            # Profiles
 6839            if profiles:
 6840
 6841                # foreach profile in configuration file
 6842                for profile in prioritizations_config:
 6843
 6844                    # If profile is asked in param, or ALL are asked (empty profile [])
 6845                    if profile in profiles or profiles == []:
 6846                        log.info(f"Profile '{profile}'")
 6847
 6848                        sql_set_info_option = ""
 6849
 6850                        sql_set_info = []
 6851
 6852                        # PZ fields set
 6853
 6854                        # PZScore
 6855                        if (
 6856                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6857                            in list_of_pzfields
 6858                        ):
 6859                            sql_set_info.append(
 6860                                f"""
 6861                                    concat(
 6862                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6863                                        {pz_prefix}Score{pzfields_sep}{profile}
 6864                                    ) 
 6865                                """
 6866                            )
 6867                            if (
 6868                                profile == default_profile
 6869                                and f"{pz_prefix}Score" in list_of_pzfields
 6870                            ):
 6871                                sql_set_info.append(
 6872                                    f"""
 6873                                        concat(
 6874                                            '{pz_prefix}Score=',
 6875                                            {pz_prefix}Score{pzfields_sep}{profile}
 6876                                        )
 6877                                    """
 6878                                )
 6879
 6880                        # PZFlag
 6881                        if (
 6882                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6883                            in list_of_pzfields
 6884                        ):
 6885                            sql_set_info.append(
 6886                                f"""
 6887                                    concat(
 6888                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 6889                                        CASE 
 6890                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6891                                            THEN 'PASS'
 6892                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6893                                            THEN 'FILTERED'
 6894                                        END
 6895                                    ) 
 6896                                """
 6897                            )
 6898                            if (
 6899                                profile == default_profile
 6900                                and f"{pz_prefix}Flag" in list_of_pzfields
 6901                            ):
 6902                                sql_set_info.append(
 6903                                    f"""
 6904                                        concat(
 6905                                            '{pz_prefix}Flag=',
 6906                                            CASE 
 6907                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6908                                                THEN 'PASS'
 6909                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6910                                                THEN 'FILTERED'
 6911                                            END
 6912                                        )
 6913                                    """
 6914                                )
 6915
 6916                        # PZComment
 6917                        if (
 6918                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 6919                            in list_of_pzfields
 6920                        ):
 6921                            sql_set_info.append(
 6922                                f"""
 6923                                    CASE
 6924                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6925                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 6926                                        ELSE ''
 6927                                    END
 6928                                """
 6929                            )
 6930                            if (
 6931                                profile == default_profile
 6932                                and f"{pz_prefix}Comment" in list_of_pzfields
 6933                            ):
 6934                                sql_set_info.append(
 6935                                    f"""
 6936                                        CASE
 6937                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6938                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 6939                                            ELSE ''
 6940                                        END
 6941                                    """
 6942                                )
 6943
 6944                        # PZInfos
 6945                        if (
 6946                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 6947                            in list_of_pzfields
 6948                        ):
 6949                            sql_set_info.append(
 6950                                f"""
 6951                                    CASE
 6952                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6953                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 6954                                        ELSE ''
 6955                                    END
 6956                                """
 6957                            )
 6958                            if (
 6959                                profile == default_profile
 6960                                and f"{pz_prefix}Infos" in list_of_pzfields
 6961                            ):
 6962                                sql_set_info.append(
 6963                                    f"""
 6964                                        CASE
 6965                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6966                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 6967                                            ELSE ''
 6968                                        END
 6969                                    """
 6970                                )
 6971
 6972                        # Merge PZfields
 6973                        sql_set_info_option = ""
 6974                        sql_set_sep = ""
 6975                        for sql_set in sql_set_info:
 6976                            if sql_set_sep:
 6977                                sql_set_info_option += f"""
 6978                                    , concat('{sql_set_sep}', {sql_set})
 6979                                """
 6980                            else:
 6981                                sql_set_info_option += f"""
 6982                                    , {sql_set}
 6983                                """
 6984                            sql_set_sep = ";"
 6985
 6986                        sql_queries = []
 6987                        for annotation in prioritizations_config[profile]:
 6988
 6989                            # Explode specific annotation
 6990                            log.debug(f"Explode annotation '{annotation}'")
 6991                            added_columns += self.explode_infos(
 6992                                prefix=explode_infos_prefix,
 6993                                fields=[annotation],
 6994                                table=table_variants,
 6995                            )
 6996                            extra_infos = self.get_extra_infos(table=table_variants)
 6997
 6998                            # Check if annotation field is present
 6999                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
 7000                                log.debug(f"Annotation '{annotation}' not in data")
 7001                                continue
 7002                            else:
 7003                                log.debug(f"Annotation '{annotation}' in data")
 7004
 7005                            # For each criterions
 7006                            for criterion in prioritizations_config[profile][
 7007                                annotation
 7008                            ]:
 7009                                criterion_type = criterion["type"]
 7010                                criterion_value = criterion["value"]
 7011                                criterion_score = criterion.get("score", 0)
 7012                                criterion_flag = criterion.get("flag", "PASS")
 7013                                criterion_flag_bool = criterion_flag == "PASS"
 7014                                criterion_comment = (
 7015                                    ", ".join(criterion.get("comment", []))
 7016                                    .replace("'", "''")
 7017                                    .replace(";", ",")
 7018                                    .replace("\t", " ")
 7019                                )
 7020                                criterion_infos = (
 7021                                    str(criterion)
 7022                                    .replace("'", "''")
 7023                                    .replace(";", ",")
 7024                                    .replace("\t", " ")
 7025                                )
 7026
 7027                                sql_set = []
 7028                                sql_set_info = []
 7029
 7030                                # PZ fields set
 7031                                if (
 7032                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7033                                    in list_of_pzfields
 7034                                ):
 7035                                    if prioritization_score_mode == "HOWARD":
 7036                                        sql_set.append(
 7037                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7038                                        )
 7039                                    elif prioritization_score_mode == "VaRank":
 7040                                        sql_set.append(
 7041                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7042                                        )
 7043                                    else:
 7044                                        sql_set.append(
 7045                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7046                                        )
 7047                                if (
 7048                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7049                                    in list_of_pzfields
 7050                                ):
 7051                                    sql_set.append(
 7052                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7053                                    )
 7054                                if (
 7055                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7056                                    in list_of_pzfields
 7057                                ):
 7058                                    sql_set.append(
 7059                                        f"""
 7060                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7061                                                concat(
 7062                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7063                                                    CASE 
 7064                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7065                                                        THEN ', '
 7066                                                        ELSE ''
 7067                                                    END,
 7068                                                    '{criterion_comment}'
 7069                                                )
 7070                                        """
 7071                                    )
 7072                                if (
 7073                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7074                                    in list_of_pzfields
 7075                                ):
 7076                                    sql_set.append(
 7077                                        f"""
 7078                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7079                                                concat(
 7080                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7081                                                    '{criterion_infos}'
 7082                                                )
 7083                                        """
 7084                                    )
 7085                                sql_set_option = ",".join(sql_set)
 7086
 7087                                # Criterion and comparison
 7088                                if sql_set_option:
 7089                                    try:
 7090                                        float(criterion_value)
 7091                                        sql_update = f"""
 7092                                            UPDATE {table_variants}
 7093                                            SET {sql_set_option}
 7094                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7095                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7096                                            """
 7097                                    except:
 7098                                        contains_option = ""
 7099                                        if criterion_type == "contains":
 7100                                            contains_option = ".*"
 7101                                        sql_update = f"""
 7102                                            UPDATE {table_variants}
 7103                                            SET {sql_set_option}
 7104                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7105                                            """
 7106                                    sql_queries.append(sql_update)
 7107                                else:
 7108                                    log.warning(
 7109                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7110                                    )
 7111
 7112                        # PZTags
 7113                        if (
 7114                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7115                            in list_of_pzfields
 7116                        ):
 7117
                            # Create PZTags value
 7119                            pztags_value = ""
 7120                            pztags_sep_default = "|"
 7121                            pztags_sep = ""
 7122                            for pzfield in pzfields:
 7123                                if pzfield not in [f"{pz_prefix}Tags"]:
 7124                                    if (
 7125                                        f"{pzfield}{pzfields_sep}{profile}"
 7126                                        in list_of_pzfields
 7127                                    ):
 7128                                        if pzfield in [f"{pz_prefix}Flag"]:
 7129                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7130                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7131                                                    THEN 'PASS'
 7132                                                    ELSE 'FILTERED'
 7133                                                END, '"""
 7134                                        else:
 7135                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7136                                        pztags_sep = pztags_sep_default
 7137
                            # Add Query update for PZTags
 7139                            sql_update_pztags = f"""
 7140                                UPDATE {table_variants}
 7141                                SET INFO = concat(
 7142                                        INFO,
 7143                                        CASE WHEN INFO NOT in ('','.')
 7144                                                THEN ';'
 7145                                                ELSE ''
 7146                                        END,
 7147                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7148                                    )
 7149                                """
 7150                            sql_queries.append(sql_update_pztags)
 7151
                            # Add Query update for PZTags for default profile
 7153                            if profile == default_profile:
 7154                                sql_update_pztags_default = f"""
 7155                                UPDATE {table_variants}
 7156                                SET INFO = concat(
 7157                                        INFO,
 7158                                        ';',
 7159                                        '{pz_prefix}Tags={pztags_value}'
 7160                                    )
 7161                                """
 7162                                sql_queries.append(sql_update_pztags_default)
 7163
 7164                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7165
 7166                        if sql_queries:
 7167
 7168                            for sql_query in sql_queries:
 7169                                log.debug(
 7170                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7171                                )
 7172                                self.conn.execute(sql_query)
 7173
 7174                        log.info(f"""Profile '{profile}' - Update... """)
 7175                        sql_query_update = f"""
 7176                            UPDATE {table_variants}
 7177                            SET INFO =  
 7178                                concat(
 7179                                    CASE
 7180                                        WHEN INFO NOT IN ('','.')
 7181                                        THEN concat(INFO, ';')
 7182                                        ELSE ''
 7183                                    END
 7184                                    {sql_set_info_option}
 7185                                )
 7186                        """
 7187                        self.conn.execute(sql_query_update)
 7188
 7189        else:
 7190
 7191            log.warning(f"No profiles in parameters")
 7192
 7193        # Remove added columns
 7194        for added_column in added_columns:
 7195            self.drop_column(column=added_column)
 7196
 7197        # Explode INFOS fields into table fields
 7198        if self.get_explode_infos():
 7199            self.explode_infos(
 7200                prefix=self.get_explode_infos_prefix(),
 7201                fields=self.get_explode_infos_fields(),
 7202                force=True,
 7203            )
 7204
 7205        return True
 7206
 7207    ###
 7208    # HGVS
 7209    ###
 7210
 7211    def annotation_hgvs(self, threads: int = None) -> None:
 7212        """
 7213        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7214        coordinates and alleles.
 7215
 7216        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7217        threads to use for parallel processing. If no value is provided, it will default to the number
 7218        of threads obtained from the `get_threads()` method
 7219        :type threads: int
 7220        """
 7221
 7222        # Function for each partition of the Dask Dataframe
 7223        def partition_function(partition):
 7224            """
 7225            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7226            each row of a DataFrame called `partition`.
 7227
 7228            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7229            to be processed
 7230            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7231            the "partition" dataframe along the axis 1.
 7232            """
 7233            return partition.apply(annotation_hgvs_partition, axis=1)
 7234
 7235        def annotation_hgvs_partition(row) -> str:
 7236            """
 7237            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7238            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7239
 7240            :param row: A dictionary-like object that contains the values for the following keys:
 7241            :return: a string that contains the HGVS names associated with the given row of data.
 7242            """
 7243
 7244            chr = row["CHROM"]
 7245            pos = row["POS"]
 7246            ref = row["REF"]
 7247            alt = row["ALT"]
 7248
 7249            # Find list of associated transcripts
 7250            transcripts_list = list(
 7251                polars_conn.execute(
 7252                    f"""
 7253                SELECT transcript
 7254                FROM refseq_df
 7255                WHERE CHROM='{chr}'
 7256                AND POS={pos}
 7257            """
 7258                )["transcript"]
 7259            )
 7260
 7261            # Full HGVS annotation in list
 7262            hgvs_full_list = []
 7263
 7264            for transcript_name in transcripts_list:
 7265
 7266                # Transcript
 7267                transcript = get_transcript(
 7268                    transcripts=transcripts, transcript_name=transcript_name
 7269                )
 7270                # Exon
 7271                if use_exon:
 7272                    exon = transcript.find_exon_number(pos)
 7273                else:
 7274                    exon = None
 7275                # Protein
 7276                transcript_protein = None
 7277                if use_protein or add_protein or full_format:
 7278                    transcripts_protein = list(
 7279                        polars_conn.execute(
 7280                            f"""
 7281                        SELECT protein
 7282                        FROM refseqlink_df
 7283                        WHERE transcript='{transcript_name}'
 7284                        LIMIT 1
 7285                    """
 7286                        )["protein"]
 7287                    )
 7288                    if len(transcripts_protein):
 7289                        transcript_protein = transcripts_protein[0]
 7290
 7291                # HGVS name
 7292                hgvs_name = format_hgvs_name(
 7293                    chr,
 7294                    pos,
 7295                    ref,
 7296                    alt,
 7297                    genome=genome,
 7298                    transcript=transcript,
 7299                    transcript_protein=transcript_protein,
 7300                    exon=exon,
 7301                    use_gene=use_gene,
 7302                    use_protein=use_protein,
 7303                    full_format=full_format,
 7304                    use_version=use_version,
 7305                    codon_type=codon_type,
 7306                )
 7307                hgvs_full_list.append(hgvs_name)
 7308                if add_protein and not use_protein and not full_format:
 7309                    hgvs_name = format_hgvs_name(
 7310                        chr,
 7311                        pos,
 7312                        ref,
 7313                        alt,
 7314                        genome=genome,
 7315                        transcript=transcript,
 7316                        transcript_protein=transcript_protein,
 7317                        exon=exon,
 7318                        use_gene=use_gene,
 7319                        use_protein=True,
 7320                        full_format=False,
 7321                        use_version=use_version,
 7322                        codon_type=codon_type,
 7323                    )
 7324                    hgvs_full_list.append(hgvs_name)
 7325
 7326            # Create liste of HGVS annotations
 7327            hgvs_full = ",".join(hgvs_full_list)
 7328
 7329            return hgvs_full
 7330
 7331        # Polars connexion
 7332        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7333
 7334        # Config
 7335        config = self.get_config()
 7336
 7337        # Databases
 7338        # Genome
 7339        databases_genomes_folders = (
 7340            config.get("folders", {})
 7341            .get("databases", {})
 7342            .get("genomes", DEFAULT_GENOME_FOLDER)
 7343        )
 7344        databases_genome = (
 7345            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7346        )
 7347        # refseq database folder
 7348        databases_refseq_folders = (
 7349            config.get("folders", {})
 7350            .get("databases", {})
 7351            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7352        )
 7353        # refseq
 7354        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7355        # refSeqLink
 7356        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7357
 7358        # Param
 7359        param = self.get_param()
 7360
 7361        # Quick HGVS
 7362        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7363            log.info(f"Quick HGVS Annotation:")
 7364            if not param.get("hgvs", None):
 7365                param["hgvs"] = {}
 7366            for option in param.get("hgvs_options", "").split(","):
 7367                option_var_val = option.split("=")
 7368                option_var = option_var_val[0]
 7369                if len(option_var_val) > 1:
 7370                    option_val = option_var_val[1]
 7371                else:
 7372                    option_val = "True"
 7373                if option_val.upper() in ["TRUE"]:
 7374                    option_val = True
 7375                elif option_val.upper() in ["FALSE"]:
 7376                    option_val = False
 7377                log.info(f"   {option_var}={option_val}")
 7378                param["hgvs"][option_var] = option_val
 7379
 7380        # Check if HGVS annotation enabled
 7381        if "hgvs" in param:
 7382            log.info(f"HGVS Annotation... ")
 7383            for hgvs_option in param.get("hgvs", {}):
 7384                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7385        else:
 7386            return
 7387
 7388        # HGVS Param
 7389        param_hgvs = param.get("hgvs", {})
 7390        use_exon = param_hgvs.get("use_exon", False)
 7391        use_gene = param_hgvs.get("use_gene", False)
 7392        use_protein = param_hgvs.get("use_protein", False)
 7393        add_protein = param_hgvs.get("add_protein", False)
 7394        full_format = param_hgvs.get("full_format", False)
 7395        use_version = param_hgvs.get("use_version", False)
 7396        codon_type = param_hgvs.get("codon_type", "3")
 7397
        # refSeq refSeqLink
 7399        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7400        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7401
 7402        # Assembly
 7403        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7404
 7405        # Genome
 7406        genome_file = None
 7407        if find_genome(databases_genome):
 7408            genome_file = find_genome(databases_genome)
 7409        else:
 7410            genome_file = find_genome(
 7411                genome_path=databases_genomes_folders, assembly=assembly
 7412            )
 7413        log.debug("Genome: " + str(genome_file))
 7414
        # refSeq
 7416        refseq_file = find_file_prefix(
 7417            input_file=databases_refseq,
 7418            prefix="ncbiRefSeq",
 7419            folder=databases_refseq_folders,
 7420            assembly=assembly,
 7421        )
 7422        log.debug("refSeq: " + str(refseq_file))
 7423
 7424        # refSeqLink
 7425        refseqlink_file = find_file_prefix(
 7426            input_file=databases_refseqlink,
 7427            prefix="ncbiRefSeqLink",
 7428            folder=databases_refseq_folders,
 7429            assembly=assembly,
 7430        )
 7431        log.debug("refSeqLink: " + str(refseqlink_file))
 7432
 7433        # Threads
 7434        if not threads:
 7435            threads = self.get_threads()
 7436        log.debug("Threads: " + str(threads))
 7437
 7438        # Variables
 7439        table_variants = self.get_table_variants(clause="update")
 7440
 7441        # Get variants SNV and InDel only
 7442        query_variants = f"""
 7443            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7444            FROM {table_variants}
 7445            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7446            """
 7447        df_variants = self.get_query_to_df(query_variants)
 7448
 7449        # Added columns
 7450        added_columns = []
 7451
 7452        # Add hgvs column in variants table
 7453        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7454        added_column = self.add_column(
 7455            table_variants, hgvs_column_name, "STRING", default_value=None
 7456        )
 7457        added_columns.append(added_column)
 7458
 7459        log.debug(f"refSeq loading...")
 7460        # refSeq in duckDB
 7461        refseq_table = get_refseq_table(
 7462            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7463        )
 7464        # Loading all refSeq in Dataframe
 7465        refseq_query = f"""
 7466            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 7467            FROM {refseq_table}
 7468            JOIN df_variants ON (
 7469                {refseq_table}.chrom = df_variants.CHROM
 7470                AND {refseq_table}.txStart<=df_variants.POS
 7471                AND {refseq_table}.txEnd>=df_variants.POS
 7472            )
 7473        """
 7474        refseq_df = self.conn.query(refseq_query).pl()
 7475
 7476        if refseqlink_file:
 7477            log.debug(f"refSeqLink loading...")
 7478            # refSeqLink in duckDB
 7479            refseqlink_table = get_refseq_table(
 7480                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 7481            )
 7482            # Loading all refSeqLink in Dataframe
 7483            protacc_column = "protAcc_with_ver"
 7484            mrnaacc_column = "mrnaAcc_with_ver"
 7485            refseqlink_query = f"""
 7486                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 7487                FROM {refseqlink_table} 
 7488                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 7489                WHERE protAcc_without_ver IS NOT NULL
 7490            """
 7491            # Polars Dataframe
 7492            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 7493
 7494        # Read RefSeq transcripts into a python dict/model.
 7495        log.debug(f"Transcripts loading...")
 7496        with tempfile.TemporaryDirectory() as tmpdir:
 7497            transcripts_query = f"""
 7498                COPY (
 7499                    SELECT {refseq_table}.*
 7500                    FROM {refseq_table}
 7501                    JOIN df_variants ON (
 7502                        {refseq_table}.chrom=df_variants.CHROM
 7503                        AND {refseq_table}.txStart<=df_variants.POS
 7504                        AND {refseq_table}.txEnd>=df_variants.POS
 7505                    )
 7506                )
 7507                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 7508            """
 7509            self.conn.query(transcripts_query)
 7510            with open(f"{tmpdir}/transcript.tsv") as infile:
 7511                transcripts = read_transcripts(infile)
 7512
 7513        # Polars connexion
 7514        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7515
 7516        log.debug("Genome loading...")
 7517        # Read genome sequence using pyfaidx.
 7518        genome = Fasta(genome_file)
 7519
 7520        log.debug("Start annotation HGVS...")
 7521
 7522        # Create
 7523        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 7524        ddf = dd.from_pandas(df_variants, npartitions=threads)
 7525
 7526        # Use dask.dataframe.apply() to apply function on each partition
 7527        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 7528
 7529        # Convert Dask DataFrame to Pandas Dataframe
 7530        df = ddf.compute()
 7531
 7532        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 7533        with tempfile.TemporaryDirectory() as tmpdir:
 7534            df_parquet = os.path.join(tmpdir, "df.parquet")
 7535            df.to_parquet(df_parquet)
 7536
 7537            # Update hgvs column
 7538            update_variant_query = f"""
 7539                UPDATE {table_variants}
 7540                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 7541                FROM read_parquet('{df_parquet}') as df
 7542                WHERE variants."#CHROM" = df.CHROM
 7543                AND variants.POS = df.POS
 7544                AND variants.REF = df.REF
 7545                AND variants.ALT = df.ALT
 7546                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 7547                """
 7548            self.execute_query(update_variant_query)
 7549
 7550        # Update INFO column
 7551        sql_query_update = f"""
 7552            UPDATE {table_variants}
 7553            SET INFO = 
 7554                concat(
 7555                    CASE 
 7556                        WHEN INFO NOT IN ('','.')
 7557                        THEN concat(INFO, ';')
 7558                        ELSE ''
 7559                    END,
 7560                    'hgvs=',
 7561                    {hgvs_column_name}
 7562                )
 7563            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 7564            """
 7565        self.execute_query(sql_query_update)
 7566
 7567        # Add header
 7568        HGVS_INFOS = {
 7569            "hgvs": {
 7570                "ID": "hgvs",
 7571                "Number": ".",
 7572                "Type": "String",
 7573                "Description": f"HGVS annotatation with HOWARD",
 7574            }
 7575        }
 7576
 7577        for field in HGVS_INFOS:
 7578            field_ID = HGVS_INFOS[field]["ID"]
 7579            field_description = HGVS_INFOS[field]["Description"]
 7580            self.get_header().infos[field_ID] = vcf.parser._Info(
 7581                field_ID,
 7582                HGVS_INFOS[field]["Number"],
 7583                HGVS_INFOS[field]["Type"],
 7584                field_description,
 7585                "unknown",
 7586                "unknown",
 7587                code_type_map[HGVS_INFOS[field]["Type"]],
 7588            )
 7589
 7590        # Remove added columns
 7591        for added_column in added_columns:
 7592            self.drop_column(column=added_column)
 7593
 7594    ###
 7595    # Calculation
 7596    ###
 7597
 7598    def get_operations_help(
 7599        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7600    ) -> list:
 7601
 7602        # Init
 7603        operations_help = []
 7604
 7605        # operations
 7606        operations = self.get_config_json(
 7607            name="calculations",
 7608            config_dict=operations_config_dict,
 7609            config_file=operations_config_file,
 7610        )
 7611        for op in operations:
 7612            op_name = operations[op].get("name", op).upper()
 7613            op_description = operations[op].get("description", op_name)
 7614            op_available = operations[op].get("available", False)
 7615            if op_available:
 7616                operations_help.append(f"   {op_name}: {op_description}")
 7617
 7618        # Sort operations
 7619        operations_help.sort()
 7620
 7621        # insert header
 7622        operations_help.insert(0, "Available calculation operations:")
 7623
 7624        # Return
 7625        return operations_help
 7626
 7627    def calculation(
 7628        self,
 7629        operations: dict = {},
 7630        operations_config_dict: dict = {},
 7631        operations_config_file: str = None,
 7632    ) -> None:
 7633        """
 7634        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7635        operation, and then calls the appropriate function
 7636
 7637        param json example:
 7638            "calculation": {
 7639                "NOMEN": {
 7640                    "options": {
 7641                        "hgvs_field": "hgvs"
 7642                    },
 7643                "middle" : null
 7644            }
 7645        """
 7646
 7647        # Param
 7648        param = self.get_param()
 7649
 7650        # operations config
 7651        operations_config = self.get_config_json(
 7652            name="calculations",
 7653            config_dict=operations_config_dict,
 7654            config_file=operations_config_file,
 7655        )
 7656
 7657        # Upper keys
 7658        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7659
 7660        # Calculations
 7661
 7662        # Operations from param
 7663        operations = param.get("calculation", {}).get("calculations", operations)
 7664
 7665        # Quick calculation - add
 7666        if param.get("calculations", None):
 7667            calculations_list = [
 7668                value for value in param.get("calculations", "").split(",")
 7669            ]
 7670            log.info(f"Quick Calculations:")
 7671            for calculation_key in calculations_list:
 7672                log.info(f"   {calculation_key}")
 7673            for calculation_operation in calculations_list:
 7674                if calculation_operation.upper() not in operations:
 7675                    operations[calculation_operation.upper()] = {}
 7676                    add_value_into_dict(
 7677                        dict_tree=param,
 7678                        sections=[
 7679                            "calculation",
 7680                            "calculations",
 7681                            calculation_operation.upper(),
 7682                        ],
 7683                        value={},
 7684                    )
 7685
 7686        # Operations for calculation
 7687        if not operations:
 7688            operations = param.get("calculation", {}).get("calculations", {})
 7689
 7690        if operations:
 7691            log.info(f"Calculations...")
 7692
 7693        # For each operations
 7694        for operation_name in operations:
 7695            operation_name = operation_name.upper()
 7696            if operation_name not in [""]:
 7697                if operation_name in operations_config:
 7698                    log.info(f"Calculation '{operation_name}'")
 7699                    operation = operations_config[operation_name]
 7700                    operation_type = operation.get("type", "sql")
 7701                    if operation_type == "python":
 7702                        self.calculation_process_function(
 7703                            operation=operation, operation_name=operation_name
 7704                        )
 7705                    elif operation_type == "sql":
 7706                        self.calculation_process_sql(
 7707                            operation=operation, operation_name=operation_name
 7708                        )
 7709                    else:
 7710                        log.error(
 7711                            f"Operations config: Type '{operation_type}' NOT available"
 7712                        )
 7713                        raise ValueError(
 7714                            f"Operations config: Type '{operation_type}' NOT available"
 7715                        )
 7716                else:
 7717                    log.error(
 7718                        f"Operations config: Calculation '{operation_name}' NOT available"
 7719                    )
 7720                    raise ValueError(
 7721                        f"Operations config: Calculation '{operation_name}' NOT available"
 7722                    )
 7723
 7724        # Explode INFOS fields into table fields
 7725        if self.get_explode_infos():
 7726            self.explode_infos(
 7727                prefix=self.get_explode_infos_prefix(),
 7728                fields=self.get_explode_infos_fields(),
 7729                force=True,
 7730            )
 7731
 7732    def calculation_process_sql(
 7733        self, operation: dict, operation_name: str = "unknown"
 7734    ) -> None:
 7735        """
 7736        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7737        performs the operation, updating the specified table with the result.
 7738
 7739        :param operation: The `operation` parameter is a dictionary that contains information about the
 7740        mathematical operation to be performed. It includes the following keys:
 7741        :type operation: dict
 7742        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7743        the mathematical operation being performed. It is used for logging and error handling purposes,
 7744        defaults to unknown
 7745        :type operation_name: str (optional)
 7746        """
 7747
 7748        # table variants
 7749        table_variants = self.get_table_variants(clause="alter")
 7750
 7751        # Operation infos
 7752        operation_name = operation.get("name", "unknown")
 7753        log.debug(f"process sql {operation_name}")
 7754        output_column_name = operation.get("output_column_name", operation_name)
 7755        output_column_type = operation.get("output_column_type", "String")
 7756        prefix = operation.get("explode_infos_prefix", "")
 7757        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7758        output_column_description = operation.get(
 7759            "output_column_description", f"{operation_name} operation"
 7760        )
 7761        operation_query = operation.get("operation_query", None)
 7762        if isinstance(operation_query, list):
 7763            operation_query = " ".join(operation_query)
 7764        operation_info_fields = operation.get("info_fields", [])
 7765        operation_info_fields_check = operation.get("info_fields_check", False)
 7766        operation_info = operation.get("operation_info", True)
 7767
 7768        if operation_query:
 7769
 7770            # Info fields check
 7771            operation_info_fields_check_result = True
 7772            if operation_info_fields_check:
 7773                header_infos = self.get_header().infos
 7774                for info_field in operation_info_fields:
 7775                    operation_info_fields_check_result = (
 7776                        operation_info_fields_check_result
 7777                        and info_field in header_infos
 7778                    )
 7779
 7780            # If info fields available
 7781            if operation_info_fields_check_result:
 7782
 7783                # Added_columns
 7784                added_columns = []
 7785
 7786                # Create VCF header field
 7787                vcf_reader = self.get_header()
 7788                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 7789                    output_column_name,
 7790                    ".",
 7791                    output_column_type,
 7792                    output_column_description,
 7793                    "howard calculation",
 7794                    "0",
 7795                    self.code_type_map.get(output_column_type),
 7796                )
 7797
 7798                # Explode infos if needed
 7799                log.debug(f"calculation_process_sql prefix {prefix}")
 7800                added_columns += self.explode_infos(
 7801                    prefix=prefix,
 7802                    fields=[output_column_name] + operation_info_fields,
 7803                    force=True,
 7804                )
 7805
 7806                # Create column
 7807                added_column = self.add_column(
 7808                    table_name=table_variants,
 7809                    column_name=prefix + output_column_name,
 7810                    column_type=output_column_type_sql,
 7811                    default_value="null",
 7812                )
 7813                added_columns.append(added_column)
 7814
 7815                # Operation calculation
 7816                try:
 7817
 7818                    # Query to update calculation column
 7819                    sql_update = f"""
 7820                        UPDATE {table_variants}
 7821                        SET "{prefix}{output_column_name}" = ({operation_query})
 7822                    """
 7823                    self.conn.execute(sql_update)
 7824
 7825                    # Add to INFO
 7826                    if operation_info:
 7827                        sql_update_info = f"""
 7828                            UPDATE {table_variants}
 7829                            SET "INFO" =
 7830                                concat(
 7831                                    CASE
 7832                                        WHEN "INFO" IS NOT NULL
 7833                                        THEN concat("INFO", ';')
 7834                                        ELSE ''
 7835                                    END,
 7836                                    '{output_column_name}=',
 7837                                    "{prefix}{output_column_name}"
 7838                                )
 7839                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 7840                        """
 7841                        self.conn.execute(sql_update_info)
 7842
 7843                except:
 7844                    log.error(
 7845                        f"Operations config: Calculation '{operation_name}' query failed"
 7846                    )
 7847                    raise ValueError(
 7848                        f"Operations config: Calculation '{operation_name}' query failed"
 7849                    )
 7850
 7851                # Remove added columns
 7852                for added_column in added_columns:
 7853                    log.debug(f"added_column: {added_column}")
 7854                    self.drop_column(column=added_column)
 7855
 7856            else:
 7857                log.error(
 7858                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7859                )
 7860                raise ValueError(
 7861                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7862                )
 7863
 7864        else:
 7865            log.error(
 7866                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7867            )
 7868            raise ValueError(
 7869                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7870            )
 7871
 7872    def calculation_process_function(
 7873        self, operation: dict, operation_name: str = "unknown"
 7874    ) -> None:
 7875        """
 7876        The `calculation_process_function` takes in an operation dictionary and performs the specified
 7877        function with the given parameters.
 7878
 7879        :param operation: The `operation` parameter is a dictionary that contains information about the
 7880        operation to be performed. It has the following keys:
 7881        :type operation: dict
 7882        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7883        the operation being performed. It is used for logging purposes, defaults to unknown
 7884        :type operation_name: str (optional)
 7885        """
 7886
 7887        operation_name = operation["name"]
 7888        log.debug(f"process sql {operation_name}")
 7889        function_name = operation["function_name"]
 7890        function_params = operation["function_params"]
 7891        getattr(self, function_name)(*function_params)
 7892
 7893    def calculation_variant_id(self) -> None:
 7894        """
 7895        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 7896        updates the INFO field of a variants table with the variant ID.
 7897        """
 7898
 7899        # variant_id annotation field
 7900        variant_id_tag = self.get_variant_id_column()
 7901        added_columns = [variant_id_tag]
 7902
 7903        # variant_id hgvs tags"
 7904        vcf_infos_tags = {
 7905            variant_id_tag: "howard variant ID annotation",
 7906        }
 7907
 7908        # Variants table
 7909        table_variants = self.get_table_variants()
 7910
 7911        # Header
 7912        vcf_reader = self.get_header()
 7913
 7914        # Add variant_id to header
 7915        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 7916            variant_id_tag,
 7917            ".",
 7918            "String",
 7919            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 7920            "howard calculation",
 7921            "0",
 7922            self.code_type_map.get("String"),
 7923        )
 7924
 7925        # Update
 7926        sql_update = f"""
 7927            UPDATE {table_variants}
 7928            SET "INFO" = 
 7929                concat(
 7930                    CASE
 7931                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 7932                        THEN ''
 7933                        ELSE concat("INFO", ';')
 7934                    END,
 7935                    '{variant_id_tag}=',
 7936                    "{variant_id_tag}"
 7937                )
 7938        """
 7939        self.conn.execute(sql_update)
 7940
 7941        # Remove added columns
 7942        for added_column in added_columns:
 7943            self.drop_column(column=added_column)
 7944
 7945    def calculation_extract_snpeff_hgvs(
 7946        self,
 7947        snpeff_hgvs: str = "snpeff_hgvs",
 7948        snpeff_field: str = "ANN",
 7949    ) -> None:
 7950        """
 7951        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 7952        annotation field in a VCF file and adds them as a new column in the variants table.
 7953
 7954        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 7955        function is used to specify the name of the column that will store the HGVS nomenclatures
 7956        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 7957        snpeff_hgvs
 7958        :type snpeff_hgvs: str (optional)
 7959        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 7960        function represents the field in the VCF file that contains SnpEff annotations. This field is
 7961        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 7962        to ANN
 7963        :type snpeff_field: str (optional)
 7964        """
 7965
 7966        # Snpeff hgvs tags
 7967        vcf_infos_tags = {
 7968            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 7969        }
 7970
 7971        # Prefix
 7972        prefix = self.get_explode_infos_prefix()
 7973        if prefix:
 7974            prefix = "INFO/"
 7975
 7976        # snpEff fields
 7977        speff_ann_infos = prefix + snpeff_field
 7978        speff_hgvs_infos = prefix + snpeff_hgvs
 7979
 7980        # Variants table
 7981        table_variants = self.get_table_variants()
 7982
 7983        # Header
 7984        vcf_reader = self.get_header()
 7985
 7986        # Add columns
 7987        added_columns = []
 7988
 7989        # Explode HGVS field in column
 7990        added_columns += self.explode_infos(fields=[snpeff_field])
 7991
 7992        if snpeff_field in vcf_reader.infos:
 7993
 7994            log.debug(vcf_reader.infos[snpeff_field])
 7995
 7996            # Extract ANN header
 7997            ann_description = vcf_reader.infos[snpeff_field].desc
 7998            pattern = r"'(.+?)'"
 7999            match = re.search(pattern, ann_description)
 8000            if match:
 8001                ann_header_match = match.group(1).split(" | ")
 8002                ann_header_desc = {}
 8003                for i in range(len(ann_header_match)):
 8004                    ann_header_info = "".join(
 8005                        char for char in ann_header_match[i] if char.isalnum()
 8006                    )
 8007                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8008                if not ann_header_desc:
 8009                    raise ValueError("Invalid header description format")
 8010            else:
 8011                raise ValueError("Invalid header description format")
 8012
 8013            # Create variant id
 8014            variant_id_column = self.get_variant_id_column()
 8015            added_columns += [variant_id_column]
 8016
 8017            # Create dataframe
 8018            dataframe_snpeff_hgvs = self.get_query_to_df(
 8019                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8020            )
 8021
 8022            # Create main NOMEN column
 8023            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8024                speff_ann_infos
 8025            ].apply(
 8026                lambda x: extract_snpeff_hgvs(
 8027                    str(x), header=list(ann_header_desc.values())
 8028                )
 8029            )
 8030
 8031            # Add snpeff_hgvs to header
 8032            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8033                snpeff_hgvs,
 8034                ".",
 8035                "String",
 8036                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8037                "howard calculation",
 8038                "0",
 8039                self.code_type_map.get("String"),
 8040            )
 8041
 8042            # Update
 8043            sql_update = f"""
 8044                UPDATE variants
 8045                SET "INFO" = 
 8046                    concat(
 8047                        CASE
 8048                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8049                            THEN ''
 8050                            ELSE concat("INFO", ';')
 8051                        END,
 8052                        CASE 
 8053                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8054                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8055                            THEN concat(
 8056                                    '{snpeff_hgvs}=',
 8057                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8058                                )
 8059                            ELSE ''
 8060                        END
 8061                    )
 8062                FROM dataframe_snpeff_hgvs
 8063                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8064
 8065            """
 8066            self.conn.execute(sql_update)
 8067
 8068            # Delete dataframe
 8069            del dataframe_snpeff_hgvs
 8070            gc.collect()
 8071
 8072        else:
 8073
 8074            log.warning(
 8075                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8076            )
 8077
 8078        # Remove added columns
 8079        for added_column in added_columns:
 8080            self.drop_column(column=added_column)
 8081
 8082    def calculation_snpeff_ann_explode(
 8083        self,
 8084        uniquify: bool = True,
 8085        output_format: str = "fields",
 8086        output_prefix: str = "snpeff_",
 8087        snpeff_field: str = "ANN",
 8088    ) -> None:
 8089        """
 8090        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8091        exploding the HGVS field and updating variant information accordingly.
 8092
 8093        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8094        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8095        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8096        defaults to True
 8097        :type uniquify: bool (optional)
 8098        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8099        function specifies the format in which the output annotations will be generated. It has a
 8100        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8101        format, defaults to fields
 8102        :type output_format: str (optional)
 8103        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8104        method is used to specify the prefix that will be added to the output annotations generated
 8105        during the calculation process. This prefix helps to differentiate the newly added annotations
 8106        from existing ones in the output data. By default, the, defaults to ANN_
 8107        :type output_prefix: str (optional)
 8108        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8109        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8110        field will be processed to explode the HGVS annotations and update the variant information
 8111        accordingly, defaults to ANN
 8112        :type snpeff_field: str (optional)
 8113        """
 8114
 8115        # SnpEff annotation field
 8116        snpeff_hgvs = "snpeff_ann_explode"
 8117
 8118        # Snpeff hgvs tags
 8119        vcf_infos_tags = {
 8120            snpeff_hgvs: "Explode snpEff annotations",
 8121        }
 8122
 8123        # Prefix
 8124        prefix = self.get_explode_infos_prefix()
 8125        if prefix:
 8126            prefix = "INFO/"
 8127
 8128        # snpEff fields
 8129        speff_ann_infos = prefix + snpeff_field
 8130        speff_hgvs_infos = prefix + snpeff_hgvs
 8131
 8132        # Variants table
 8133        table_variants = self.get_table_variants()
 8134
 8135        # Header
 8136        vcf_reader = self.get_header()
 8137
 8138        # Add columns
 8139        added_columns = []
 8140
 8141        # Explode HGVS field in column
 8142        added_columns += self.explode_infos(fields=[snpeff_field])
 8143        log.debug(f"snpeff_field={snpeff_field}")
 8144        log.debug(f"added_columns={added_columns}")
 8145
 8146        if snpeff_field in vcf_reader.infos:
 8147
 8148            # Extract ANN header
 8149            ann_description = vcf_reader.infos[snpeff_field].desc
 8150            pattern = r"'(.+?)'"
 8151            match = re.search(pattern, ann_description)
 8152            if match:
 8153                ann_header_match = match.group(1).split(" | ")
 8154                ann_header = []
 8155                ann_header_desc = {}
 8156                for i in range(len(ann_header_match)):
 8157                    ann_header_info = "".join(
 8158                        char for char in ann_header_match[i] if char.isalnum()
 8159                    )
 8160                    ann_header.append(ann_header_info)
 8161                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8162                if not ann_header_desc:
 8163                    raise ValueError("Invalid header description format")
 8164            else:
 8165                raise ValueError("Invalid header description format")
 8166
 8167            # Create variant id
 8168            variant_id_column = self.get_variant_id_column()
 8169            added_columns += [variant_id_column]
 8170
 8171            # Create dataframe
 8172            dataframe_snpeff_hgvs = self.get_query_to_df(
 8173                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8174            )
 8175
 8176            # Create snpEff columns
 8177            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8178                speff_ann_infos
 8179            ].apply(
 8180                lambda x: explode_snpeff_ann(
 8181                    str(x),
 8182                    uniquify=uniquify,
 8183                    output_format=output_format,
 8184                    prefix=output_prefix,
 8185                    header=list(ann_header_desc.values()),
 8186                )
 8187            )
 8188
 8189            # Header
 8190            ann_annotations_prefix = ""
 8191            if output_format.upper() in ["JSON"]:
 8192                ann_annotations_prefix = f"{output_prefix}="
 8193                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8194                    output_prefix,
 8195                    ".",
 8196                    "String",
 8197                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8198                    + " - JSON format",
 8199                    "howard calculation",
 8200                    "0",
 8201                    self.code_type_map.get("String"),
 8202                )
 8203            else:
 8204                for ann_annotation in ann_header:
 8205                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8206                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8207                        ann_annotation_id,
 8208                        ".",
 8209                        "String",
 8210                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8211                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8212                        "howard calculation",
 8213                        "0",
 8214                        self.code_type_map.get("String"),
 8215                    )
 8216
 8217            # Update
 8218            sql_update = f"""
 8219                UPDATE variants
 8220                SET "INFO" = 
 8221                    concat(
 8222                        CASE
 8223                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8224                            THEN ''
 8225                            ELSE concat("INFO", ';')
 8226                        END,
 8227                        CASE 
 8228                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8229                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8230                            THEN concat(
 8231                                '{ann_annotations_prefix}',
 8232                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8233                                )
 8234                            ELSE ''
 8235                        END
 8236                    )
 8237                FROM dataframe_snpeff_hgvs
 8238                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8239
 8240            """
 8241            self.conn.execute(sql_update)
 8242
 8243            # Delete dataframe
 8244            del dataframe_snpeff_hgvs
 8245            gc.collect()
 8246
 8247        else:
 8248
 8249            log.warning(
 8250                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8251            )
 8252
 8253        # Remove added columns
 8254        for added_column in added_columns:
 8255            self.drop_column(column=added_column)
 8256
    def calculation_extract_nomen(self) -> None:
        """
        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

        Steps:
        1. Read the HGVS field name and optional transcripts-of-preference file
           from the 'calculation.calculations.NOMEN.options' param section.
        2. Explode the HGVS INFO field into a column, run `find_nomen` on each
           value to obtain a dict of NOMEN sub-fields, then expand that dict
           into one column per field (NOMEN, CNOMEN, PNOMEN, ...).
        3. Register each field in the VCF header and append ';FIELD=value'
           entries to the INFO column via a DuckDB UPDATE ... FROM join on
           #CHROM/POS/REF/ALT.
        4. Drop the temporary exploded columns.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary column holding the dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: field name -> description used in the VCF header
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns (e.g. 'INFO/')
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field holding HGVS annotations (default 'hgvs')
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional transcripts-of-preference file; the first column lists
        # transcript identifiers in order of priority
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Temporary columns to remove at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded column actually created for the HGVS field
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # NOTE(review): DuckDB resolves 'dataframe_hgvs' in the SQL below
            # by this local variable's name (replacement scan) — do not rename
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # find_nomen returns a dict of NOMEN sub-fields for each HGVS value
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Expand each dict entry into its own dataframe column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the field in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Append ';FIELD=value' only when the value is non-null and non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # NOTE(review): unlike sibling calculations, '' and '.' INFO values
            # are kept as-is here (only NULL becomes ''), so the appended
            # ';FIELD=' entries can leave a leading ';' — confirm intended
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_hgvs
            gc.collect()

        # Remove temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8400    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8401        """
 8402        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8403        pipeline/sample for a variant and updates the variant information in a VCF file.
 8404
 8405        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8406        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8407        VCF header and to update the corresponding field in the variants table, defaults to
 8408        findbypipeline
 8409        :type tag: str (optional)
 8410        """
 8411
 8412        # if FORMAT and samples
 8413        if (
 8414            "FORMAT" in self.get_header_columns_as_list()
 8415            and self.get_header_sample_list()
 8416        ):
 8417
 8418            # findbypipeline annotation field
 8419            findbypipeline_tag = tag
 8420
 8421            # VCF infos tags
 8422            vcf_infos_tags = {
 8423                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8424            }
 8425
 8426            # Prefix
 8427            prefix = self.get_explode_infos_prefix()
 8428
 8429            # Field
 8430            findbypipeline_infos = prefix + findbypipeline_tag
 8431
 8432            # Variants table
 8433            table_variants = self.get_table_variants()
 8434
 8435            # Header
 8436            vcf_reader = self.get_header()
 8437
 8438            # Create variant id
 8439            variant_id_column = self.get_variant_id_column()
 8440            added_columns = [variant_id_column]
 8441
 8442            # variant_id, FORMAT and samples
 8443            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8444                self.get_header_sample_list()
 8445            )
 8446
 8447            # Create dataframe
 8448            dataframe_findbypipeline = self.get_query_to_df(
 8449                f""" SELECT {samples_fields} FROM {table_variants} """
 8450            )
 8451
 8452            # Create findbypipeline column
 8453            dataframe_findbypipeline[findbypipeline_infos] = (
 8454                dataframe_findbypipeline.apply(
 8455                    lambda row: findbypipeline(
 8456                        row, samples=self.get_header_sample_list()
 8457                    ),
 8458                    axis=1,
 8459                )
 8460            )
 8461
 8462            # Add snpeff_hgvs to header
 8463            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8464                findbypipeline_tag,
 8465                ".",
 8466                "String",
 8467                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8468                "howard calculation",
 8469                "0",
 8470                self.code_type_map.get("String"),
 8471            )
 8472
 8473            # Update
 8474            sql_update = f"""
 8475                UPDATE variants
 8476                SET "INFO" = 
 8477                    concat(
 8478                        CASE
 8479                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8480                            THEN ''
 8481                            ELSE concat("INFO", ';')
 8482                        END,
 8483                        CASE 
 8484                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8485                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8486                            THEN concat(
 8487                                    '{findbypipeline_tag}=',
 8488                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8489                                )
 8490                            ELSE ''
 8491                        END
 8492                    )
 8493                FROM dataframe_findbypipeline
 8494                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8495            """
 8496            self.conn.execute(sql_update)
 8497
 8498            # Remove added columns
 8499            for added_column in added_columns:
 8500                self.drop_column(column=added_column)
 8501
 8502            # Delete dataframe
 8503            del dataframe_findbypipeline
 8504            gc.collect()
 8505
 8506    def calculation_genotype_concordance(self) -> None:
 8507        """
 8508        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8509        multi-caller VCF files and updates the variant information in the database.
 8510        """
 8511
 8512        # if FORMAT and samples
 8513        if (
 8514            "FORMAT" in self.get_header_columns_as_list()
 8515            and self.get_header_sample_list()
 8516        ):
 8517
 8518            # genotypeconcordance annotation field
 8519            genotypeconcordance_tag = "genotypeconcordance"
 8520
 8521            # VCF infos tags
 8522            vcf_infos_tags = {
 8523                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8524            }
 8525
 8526            # Prefix
 8527            prefix = self.get_explode_infos_prefix()
 8528
 8529            # Field
 8530            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8531
 8532            # Variants table
 8533            table_variants = self.get_table_variants()
 8534
 8535            # Header
 8536            vcf_reader = self.get_header()
 8537
 8538            # Create variant id
 8539            variant_id_column = self.get_variant_id_column()
 8540            added_columns = [variant_id_column]
 8541
 8542            # variant_id, FORMAT and samples
 8543            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8544                self.get_header_sample_list()
 8545            )
 8546
 8547            # Create dataframe
 8548            dataframe_genotypeconcordance = self.get_query_to_df(
 8549                f""" SELECT {samples_fields} FROM {table_variants} """
 8550            )
 8551
 8552            # Create genotypeconcordance column
 8553            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8554                dataframe_genotypeconcordance.apply(
 8555                    lambda row: genotypeconcordance(
 8556                        row, samples=self.get_header_sample_list()
 8557                    ),
 8558                    axis=1,
 8559                )
 8560            )
 8561
 8562            # Add genotypeconcordance to header
 8563            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8564                genotypeconcordance_tag,
 8565                ".",
 8566                "String",
 8567                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8568                "howard calculation",
 8569                "0",
 8570                self.code_type_map.get("String"),
 8571            )
 8572
 8573            # Update
 8574            sql_update = f"""
 8575                UPDATE variants
 8576                SET "INFO" = 
 8577                    concat(
 8578                        CASE
 8579                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8580                            THEN ''
 8581                            ELSE concat("INFO", ';')
 8582                        END,
 8583                        CASE
 8584                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8585                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8586                            THEN concat(
 8587                                    '{genotypeconcordance_tag}=',
 8588                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8589                                )
 8590                            ELSE ''
 8591                        END
 8592                    )
 8593                FROM dataframe_genotypeconcordance
 8594                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8595            """
 8596            self.conn.execute(sql_update)
 8597
 8598            # Remove added columns
 8599            for added_column in added_columns:
 8600                self.drop_column(column=added_column)
 8601
 8602            # Delete dataframe
 8603            del dataframe_genotypeconcordance
 8604            gc.collect()
 8605
 8606    def calculation_barcode(self, tag: str = "barcode") -> None:
 8607        """
 8608        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8609        updates the INFO field in the file with the calculated barcode values.
 8610
 8611        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8612        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8613        the default tag name is set to "barcode", defaults to barcode
 8614        :type tag: str (optional)
 8615        """
 8616
 8617        # if FORMAT and samples
 8618        if (
 8619            "FORMAT" in self.get_header_columns_as_list()
 8620            and self.get_header_sample_list()
 8621        ):
 8622
 8623            # barcode annotation field
 8624            if not tag:
 8625                tag = "barcode"
 8626
 8627            # VCF infos tags
 8628            vcf_infos_tags = {
 8629                tag: "barcode calculation (VaRank)",
 8630            }
 8631
 8632            # Prefix
 8633            prefix = self.get_explode_infos_prefix()
 8634
 8635            # Field
 8636            barcode_infos = prefix + tag
 8637
 8638            # Variants table
 8639            table_variants = self.get_table_variants()
 8640
 8641            # Header
 8642            vcf_reader = self.get_header()
 8643
 8644            # Create variant id
 8645            variant_id_column = self.get_variant_id_column()
 8646            added_columns = [variant_id_column]
 8647
 8648            # variant_id, FORMAT and samples
 8649            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8650                self.get_header_sample_list()
 8651            )
 8652
 8653            # Create dataframe
 8654            dataframe_barcode = self.get_query_to_df(
 8655                f""" SELECT {samples_fields} FROM {table_variants} """
 8656            )
 8657
 8658            # Create barcode column
 8659            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8660                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8661            )
 8662
 8663            # Add barcode to header
 8664            vcf_reader.infos[tag] = vcf.parser._Info(
 8665                tag,
 8666                ".",
 8667                "String",
 8668                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8669                "howard calculation",
 8670                "0",
 8671                self.code_type_map.get("String"),
 8672            )
 8673
 8674            # Update
 8675            sql_update = f"""
 8676                UPDATE {table_variants}
 8677                SET "INFO" = 
 8678                    concat(
 8679                        CASE
 8680                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8681                            THEN ''
 8682                            ELSE concat("INFO", ';')
 8683                        END,
 8684                        CASE
 8685                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8686                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8687                            THEN concat(
 8688                                    '{tag}=',
 8689                                    dataframe_barcode."{barcode_infos}"
 8690                                )
 8691                            ELSE ''
 8692                        END
 8693                    )
 8694                FROM dataframe_barcode
 8695                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8696            """
 8697            self.conn.execute(sql_update)
 8698
 8699            # Remove added columns
 8700            for added_column in added_columns:
 8701                self.drop_column(column=added_column)
 8702
 8703            # Delete dataframe
 8704            del dataframe_barcode
 8705            gc.collect()
 8706
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree is read from the
        'calculation.calculations.BARCODEFAMILY.family_pedigree' param as a
        JSON file path, a JSON string, a comma-separated sample list, or a
        dict; without a pedigree, all samples are used. The `barcode` helper
        is applied over the pedigree samples' genotype columns, two FORMAT
        fields (`tag` and `tag`+'S') are declared in the header, and each
        sample column is extended with ':<barcode>:<samples>' (or ':.:.' for
        samples outside the pedigree).

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)

        :raises ValueError: if the pedigree is malformed or resolves to no sample
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against tag=None / tag="")
            if not tag:
                tag = "BCF"

            # VCF infos tags: barcode value and the list of samples it covers
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree into a dict member -> sample
            if ped:

                # Pedigree is a JSON file path
                # (NOTE(review): 'ped' is shadowed by the file handle, then
                # rebound to the parsed dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples involved in the barcode, in pedigree order
                ped_samples = list(ped.values())

            else:
                # No pedigree: every sample maps to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Temporary variant id column (dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and the pedigree samples only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Materialize the genotype columns
            # (DuckDB resolves 'dataframe_barcode' in the SQL below by this
            # local variable's name — do not rename)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row over pedigree samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both FORMAT fields (barcode value and sample list)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT):
            # pedigree samples get the barcode and sample list, FORMAT gets
            # the new field names, other samples get placeholders
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Used to turn FORMAT into a './.:.:...' template for missing
                # genotypes: strip field names, then pad each ':' with '.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE joined on variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_barcode
            gc.collect()
 8897    def calculation_trio(self) -> None:
 8898        """
 8899        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 8900        information to the INFO field of each variant.
 8901        """
 8902
 8903        # if FORMAT and samples
 8904        if (
 8905            "FORMAT" in self.get_header_columns_as_list()
 8906            and self.get_header_sample_list()
 8907        ):
 8908
 8909            # trio annotation field
 8910            trio_tag = "trio"
 8911
 8912            # VCF infos tags
 8913            vcf_infos_tags = {
 8914                "trio": "trio calculation",
 8915            }
 8916
 8917            # Param
 8918            param = self.get_param()
 8919
 8920            # Prefix
 8921            prefix = self.get_explode_infos_prefix()
 8922
 8923            # Trio param
 8924            trio_ped = (
 8925                param.get("calculation", {})
 8926                .get("calculations", {})
 8927                .get("TRIO", {})
 8928                .get("trio_pedigree", None)
 8929            )
 8930
 8931            # Load trio
 8932            if trio_ped:
 8933
 8934                # Trio pedigree is a file
 8935                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 8936                    log.debug("TRIO pedigree is file")
 8937                    with open(full_path(trio_ped)) as trio_ped:
 8938                        trio_ped = json.load(trio_ped)
 8939
 8940                # Trio pedigree is a string
 8941                elif isinstance(trio_ped, str):
 8942                    log.debug("TRIO pedigree is str")
 8943                    try:
 8944                        trio_ped = json.loads(trio_ped)
 8945                        log.debug("TRIO pedigree is json str")
 8946                    except ValueError as e:
 8947                        trio_samples = trio_ped.split(",")
 8948                        if len(trio_samples) == 3:
 8949                            trio_ped = {
 8950                                "father": trio_samples[0],
 8951                                "mother": trio_samples[1],
 8952                                "child": trio_samples[2],
 8953                            }
 8954                            log.debug("TRIO pedigree is list str")
 8955                        else:
 8956                            msg_error = "TRIO pedigree not well formatted"
 8957                            log.error(msg_error)
 8958                            raise ValueError(msg_error)
 8959
 8960                # Trio pedigree is a dict
 8961                elif isinstance(trio_ped, dict):
 8962                    log.debug("TRIO pedigree is dict")
 8963
 8964                # Trio pedigree is not well formatted
 8965                else:
 8966                    msg_error = "TRIO pedigree not well formatted"
 8967                    log.error(msg_error)
 8968                    raise ValueError(msg_error)
 8969
 8970                # Construct trio list
 8971                trio_samples = [
 8972                    trio_ped.get("father", ""),
 8973                    trio_ped.get("mother", ""),
 8974                    trio_ped.get("child", ""),
 8975                ]
 8976
 8977            else:
 8978                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 8979                samples_list = self.get_header_sample_list()
 8980                if len(samples_list) >= 3:
 8981                    trio_samples = self.get_header_sample_list()[0:3]
 8982                    trio_ped = {
 8983                        "father": trio_samples[0],
 8984                        "mother": trio_samples[1],
 8985                        "child": trio_samples[2],
 8986                    }
 8987                else:
 8988                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 8989                    log.error(msg_error)
 8990                    raise ValueError(msg_error)
 8991
 8992            # Check trio pedigree
 8993            if not trio_ped or len(trio_ped) != 3:
 8994                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 8995                log.error(msg_error)
 8996                raise ValueError(msg_error)
 8997
 8998            # Log
 8999            log.info(
 9000                f"Calculation 'TRIO' - Samples: "
 9001                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9002            )
 9003
 9004            # Field
 9005            trio_infos = prefix + trio_tag
 9006
 9007            # Variants table
 9008            table_variants = self.get_table_variants()
 9009
 9010            # Header
 9011            vcf_reader = self.get_header()
 9012
 9013            # Create variant id
 9014            variant_id_column = self.get_variant_id_column()
 9015            added_columns = [variant_id_column]
 9016
 9017            # variant_id, FORMAT and samples
 9018            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9019                self.get_header_sample_list()
 9020            )
 9021
 9022            # Create dataframe
 9023            dataframe_trio = self.get_query_to_df(
 9024                f""" SELECT {samples_fields} FROM {table_variants} """
 9025            )
 9026
 9027            # Create trio column
 9028            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9029                lambda row: trio(row, samples=trio_samples), axis=1
 9030            )
 9031
 9032            # Add trio to header
 9033            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9034                trio_tag,
 9035                ".",
 9036                "String",
 9037                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9038                "howard calculation",
 9039                "0",
 9040                self.code_type_map.get("String"),
 9041            )
 9042
 9043            # Update
 9044            sql_update = f"""
 9045                UPDATE {table_variants}
 9046                SET "INFO" = 
 9047                    concat(
 9048                        CASE
 9049                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9050                            THEN ''
 9051                            ELSE concat("INFO", ';')
 9052                        END,
 9053                        CASE
 9054                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9055                             AND dataframe_trio."{trio_infos}" NOT NULL
 9056                            THEN concat(
 9057                                    '{trio_tag}=',
 9058                                    dataframe_trio."{trio_infos}"
 9059                                )
 9060                            ELSE ''
 9061                        END
 9062                    )
 9063                FROM dataframe_trio
 9064                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9065            """
 9066            self.conn.execute(sql_update)
 9067
 9068            # Remove added columns
 9069            for added_column in added_columns:
 9070                self.drop_column(column=added_column)
 9071
 9072            # Delete dataframe
 9073            del dataframe_trio
 9074            gc.collect()
 9075
 9076    def calculation_vaf_normalization(self) -> None:
 9077        """
 9078        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9079        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9080        :return: The function does not return anything.
 9081        """
 9082
 9083        # if FORMAT and samples
 9084        if (
 9085            "FORMAT" in self.get_header_columns_as_list()
 9086            and self.get_header_sample_list()
 9087        ):
 9088
 9089            # vaf_normalization annotation field
 9090            vaf_normalization_tag = "VAF"
 9091
 9092            # VCF infos tags
 9093            vcf_infos_tags = {
 9094                "VAF": "VAF Variant Frequency",
 9095            }
 9096
 9097            # Prefix
 9098            prefix = self.get_explode_infos_prefix()
 9099
 9100            # Variants table
 9101            table_variants = self.get_table_variants()
 9102
 9103            # Header
 9104            vcf_reader = self.get_header()
 9105
 9106            # Do not calculate if VAF already exists
 9107            if "VAF" in vcf_reader.formats:
 9108                log.debug("VAF already on genotypes")
 9109                return
 9110
 9111            # Create variant id
 9112            variant_id_column = self.get_variant_id_column()
 9113            added_columns = [variant_id_column]
 9114
 9115            # variant_id, FORMAT and samples
 9116            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9117                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9118            )
 9119
 9120            # Create dataframe
 9121            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9122            log.debug(f"query={query}")
 9123            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9124
 9125            vaf_normalization_set = []
 9126
 9127            # for each sample vaf_normalization
 9128            for sample in self.get_header_sample_list():
 9129                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9130                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9131                )
 9132                vaf_normalization_set.append(
 9133                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9134                )
 9135
 9136            # Add VAF to FORMAT
 9137            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9138                "FORMAT"
 9139            ].apply(lambda x: str(x) + ":VAF")
 9140            vaf_normalization_set.append(
 9141                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9142            )
 9143
 9144            # Add vaf_normalization to header
 9145            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9146                id=vaf_normalization_tag,
 9147                num="1",
 9148                type="Float",
 9149                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9150                type_code=self.code_type_map.get("Float"),
 9151            )
 9152
 9153            # Create fields to add in INFO
 9154            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9155
 9156            # Update
 9157            sql_update = f"""
 9158                UPDATE {table_variants}
 9159                SET {sql_vaf_normalization_set}
 9160                FROM dataframe_vaf_normalization
 9161                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9162
 9163            """
 9164            self.conn.execute(sql_update)
 9165
 9166            # Remove added columns
 9167            for added_column in added_columns:
 9168                self.drop_column(column=added_column)
 9169
 9170            # Delete dataframe
 9171            del dataframe_vaf_normalization
 9172            gc.collect()
 9173
 9174    def calculation_genotype_stats(self, info: str = "VAF") -> None:
 9175        """
 9176        The `calculation_genotype_stats` function calculates genotype statistics for a given information
 9177        field in a VCF file and updates the INFO column of the variants table with the calculated
 9178        statistics.
 9179
 9180        :param info: The `info` parameter is a string that represents the type of information for which
 9181        genotype statistics are calculated. It is used to generate various VCF info tags for the
 9182        statistics, such as the number of occurrences, the list of values, the minimum value, the
 9183        maximum value, the mean, the median, defaults to VAF
 9184        :type info: str (optional)
 9185        """
 9186
 9187        # if FORMAT and samples
 9188        if (
 9189            "FORMAT" in self.get_header_columns_as_list()
 9190            and self.get_header_sample_list()
 9191        ):
 9192
 9193            # vaf_stats annotation field
 9194            vaf_stats_tag = info + "_stats"
 9195
 9196            # VCF infos tags
 9197            vcf_infos_tags = {
 9198                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
 9199                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
 9200                info + "_stats_min": f"genotype {info} Statistics - min {info}",
 9201                info + "_stats_max": f"genotype {info} Statistics - max {info}",
 9202                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
 9203                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
 9204                info
 9205                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
 9206            }
 9207
 9208            # Prefix
 9209            prefix = self.get_explode_infos_prefix()
 9210
 9211            # Field
 9212            vaf_stats_infos = prefix + vaf_stats_tag
 9213
 9214            # Variants table
 9215            table_variants = self.get_table_variants()
 9216
 9217            # Header
 9218            vcf_reader = self.get_header()
 9219
 9220            # Create variant id
 9221            variant_id_column = self.get_variant_id_column()
 9222            added_columns = [variant_id_column]
 9223
 9224            # variant_id, FORMAT and samples
 9225            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9226                self.get_header_sample_list()
 9227            )
 9228
 9229            # Create dataframe
 9230            dataframe_vaf_stats = self.get_query_to_df(
 9231                f""" SELECT {samples_fields} FROM {table_variants} """
 9232            )
 9233
 9234            # Create vaf_stats column
 9235            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
 9236                lambda row: genotype_stats(
 9237                    row, samples=self.get_header_sample_list(), info=info
 9238                ),
 9239                axis=1,
 9240            )
 9241
 9242            # List of vcf tags
 9243            sql_vaf_stats_fields = []
 9244
 9245            # Check all VAF stats infos
 9246            for stat in vcf_infos_tags:
 9247
 9248                # Extract stats
 9249                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
 9250                    lambda x: dict(x).get(stat, "")
 9251                )
 9252
 9253                # Add snpeff_hgvs to header
 9254                vcf_reader.infos[stat] = vcf.parser._Info(
 9255                    stat,
 9256                    ".",
 9257                    "String",
 9258                    vcf_infos_tags.get(stat, "genotype statistics"),
 9259                    "howard calculation",
 9260                    "0",
 9261                    self.code_type_map.get("String"),
 9262                )
 9263
 9264                if len(sql_vaf_stats_fields):
 9265                    sep = ";"
 9266                else:
 9267                    sep = ""
 9268
 9269                # Create fields to add in INFO
 9270                sql_vaf_stats_fields.append(
 9271                    f"""
 9272                        CASE
 9273                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
 9274                            THEN concat(
 9275                                    '{sep}{stat}=',
 9276                                    dataframe_vaf_stats."{stat}"
 9277                                )
 9278                            ELSE ''
 9279                        END
 9280                    """
 9281                )
 9282
 9283            # SQL set for update
 9284            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
 9285
 9286            # Update
 9287            sql_update = f"""
 9288                UPDATE {table_variants}
 9289                SET "INFO" = 
 9290                    concat(
 9291                        CASE
 9292                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9293                            THEN ''
 9294                            ELSE concat("INFO", ';')
 9295                        END,
 9296                        {sql_vaf_stats_fields_set}
 9297                    )
 9298                FROM dataframe_vaf_stats
 9299                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
 9300
 9301            """
 9302            self.conn.execute(sql_update)
 9303
 9304            # Remove added columns
 9305            for added_column in added_columns:
 9306                self.drop_column(column=added_column)
 9307
 9308            # Delete dataframe
 9309            del dataframe_vaf_stats
 9310            gc.collect()
 9311
 9312    def calculation_transcripts_annotation(
 9313        self, info_json: str = None, info_format: str = None
 9314    ) -> None:
 9315        """
 9316        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9317        field to it if transcripts are available.
 9318
 9319        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9320        is a string parameter that represents the information field to be used in the transcripts JSON.
 9321        It is used to specify the JSON format for the transcripts information. If no value is provided
 9322        when calling the method, it defaults to "
 9323        :type info_json: str
 9324        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9325        method is a string parameter that specifies the format of the information field to be used in
 9326        the transcripts JSON. It is used to define the format of the information field
 9327        :type info_format: str
 9328        """
 9329
 9330        # Create transcripts table
 9331        transcripts_table = self.create_transcript_view()
 9332
 9333        # Add info field
 9334        if transcripts_table:
 9335            self.transcript_view_to_variants(
 9336                transcripts_table=transcripts_table,
 9337                transcripts_info_field_json=info_json,
 9338                transcripts_info_field_format=info_format,
 9339            )
 9340        else:
 9341            log.info("No Transcripts to process. Check param.json file configuration")
 9342
 9343    def calculation_transcripts_prioritization(self) -> None:
 9344        """
 9345        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9346        prioritizes transcripts based on certain criteria.
 9347        """
 9348
 9349        # Create transcripts table
 9350        transcripts_table = self.create_transcript_view()
 9351
 9352        # Add info field
 9353        if transcripts_table:
 9354            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9355        else:
 9356            log.info("No Transcripts to process. Check param.json file configuration")
 9357
 9358    ###############
 9359    # Transcripts #
 9360    ###############
 9361
 9362    def transcripts_prioritization(
 9363        self, transcripts_table: str = None, param: dict = {}
 9364    ) -> bool:
 9365        """
 9366        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9367        and updates the variants table with the prioritized information.
 9368
 9369        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9370        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9371        This parameter is used to identify the table where the transcripts data is stored for the
 9372        prioritization process
 9373        :type transcripts_table: str
 9374        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9375        that contains various configuration settings for the prioritization process of transcripts. It
 9376        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9377        the prefix for prioritization fields, default profiles, and other
 9378        :type param: dict
 9379        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9380        transcripts prioritization process is successfully completed, and `False` if there are any
 9381        issues or if no profile is defined for transcripts prioritization.
 9382        """
 9383
 9384        log.debug("Start transcripts prioritization...")
 9385
 9386        # Param
 9387        if not param:
 9388            param = self.get_param()
 9389
 9390        # Variants table
 9391        table_variants = self.get_table_variants()
 9392        log.debug(f"transcripts_table={transcripts_table}")
 9393        # Transcripts table
 9394        if transcripts_table is None:
 9395            log.debug(f"transcripts_table={transcripts_table}")
 9396            transcripts_table = self.create_transcript_view(
 9397                transcripts_table="transcripts", param=param
 9398            )
 9399            log.debug(f"transcripts_table={transcripts_table}")
 9400        if transcripts_table is None:
 9401            msg_err = "No Transcripts table availalble"
 9402            log.error(msg_err)
 9403            raise ValueError(msg_err)
 9404
 9405        # Get transcripts columns
 9406        columns_as_list_query = f"""
 9407            DESCRIBE {transcripts_table}
 9408        """
 9409        columns_as_list = list(
 9410            self.get_query_to_df(columns_as_list_query)["column_name"]
 9411        )
 9412
 9413        # Create INFO if not exists
 9414        if "INFO" not in columns_as_list:
 9415            query_add_info = f"""
 9416                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9417            """
 9418            self.execute_query(query_add_info)
 9419
 9420        # Prioritization param and Force only PZ Score and Flag
 9421        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9422        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9423        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9424        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9425        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9426        pz_profile_default = (
 9427            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9428        )
 9429
 9430        # Exit if no profile
 9431        if pz_profile_default is None:
 9432            log.warning("No profile defined for transcripts prioritization")
 9433            return False
 9434
 9435        # Prioritization
 9436        prioritization_result = self.prioritization(
 9437            table=transcripts_table,
 9438            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9439        )
 9440        if not prioritization_result:
 9441            log.warning("Transcripts prioritization not processed")
 9442            return False
 9443
 9444        # Explode PZ fields
 9445        self.explode_infos(
 9446            table=transcripts_table,
 9447            fields=param.get("transcripts", {})
 9448            .get("prioritization", {})
 9449            .get("pzfields", []),
 9450        )
 9451
 9452        # Export Transcripts prioritization infos to variants table
 9453        query_update = f"""
 9454            WITH RankedTranscripts AS (
 9455                SELECT
 9456                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9457                    ROW_NUMBER() OVER (
 9458                        PARTITION BY "#CHROM", POS, REF, ALT
 9459                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9460                    ) AS rn
 9461                FROM
 9462                    {transcripts_table}
 9463            )
 9464            UPDATE {table_variants}
 9465                SET
 9466                INFO = CONCAT(CASE
 9467                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9468                            THEN ''
 9469                            ELSE concat("INFO", ';')
 9470                        END,
 9471                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9472                        )
 9473            FROM
 9474                RankedTranscripts
 9475            WHERE
 9476                rn = 1
 9477                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9478                AND variants."POS" = RankedTranscripts."POS"
 9479                AND variants."REF" = RankedTranscripts."REF"
 9480                AND variants."ALT" = RankedTranscripts."ALT"
 9481                
 9482        """
 9483        self.execute_query(query=query_update)
 9484
 9485        # Add PZ Transcript in header
 9486        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9487            pz_fields_transcripts,
 9488            ".",
 9489            "String",
 9490            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9491            "unknown",
 9492            "unknown",
 9493            code_type_map["String"],
 9494        )
 9495
 9496        # Return
 9497        return True
 9498
 9499    def create_transcript_view_from_columns_map(
 9500        self,
 9501        transcripts_table: str = "transcripts",
 9502        columns_maps: dict = {},
 9503        added_columns: list = [],
 9504        temporary_tables: list = None,
 9505        annotation_fields: list = None,
 9506    ) -> tuple[list, list, list]:
 9507        """
 9508        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9509        specified columns mapping for transcripts data.
 9510
 9511        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9512        the table where the transcripts data is stored or will be stored in the database. This table
 9513        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9514        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9515        :type transcripts_table: str (optional)
 9516        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9517        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9518        represents a mapping configuration for a specific set of columns. It typically includes details such
 9519        as the main transcript column and additional information columns
 9520        :type columns_maps: dict
 9521        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9522        function is a list that stores the additional columns that will be added to the view being created
 9523        based on the columns map provided. These columns are generated by exploding the transcript
 9524        information columns along with the main transcript column
 9525        :type added_columns: list
 9526        :param temporary_tables: The `temporary_tables` parameter in the
 9527        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9528        tables created during the process of creating a transcript view from a columns map. These temporary
 9529        tables are used to store intermediate results or transformations before the final view is generated
 9530        :type temporary_tables: list
 9531        :param annotation_fields: The `annotation_fields` parameter in the
 9532        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9533        for annotation in the query view creation process. These fields are extracted from the
 9534        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9535        :type annotation_fields: list
 9536        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9537        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9538        """
 9539
 9540        log.debug("Start transcrpts view creation from columns map...")
 9541
 9542        # "from_columns_map": [
 9543        #     {
 9544        #         "transcripts_column": "Ensembl_transcriptid",
 9545        #         "transcripts_infos_columns": [
 9546        #             "genename",
 9547        #             "Ensembl_geneid",
 9548        #             "LIST_S2_score",
 9549        #             "LIST_S2_pred",
 9550        #         ],
 9551        #     },
 9552        #     {
 9553        #         "transcripts_column": "Ensembl_transcriptid",
 9554        #         "transcripts_infos_columns": [
 9555        #             "genename",
 9556        #             "VARITY_R_score",
 9557        #             "Aloft_pred",
 9558        #         ],
 9559        #     },
 9560        # ],
 9561
 9562        # Init
 9563        if temporary_tables is None:
 9564            temporary_tables = []
 9565        if annotation_fields is None:
 9566            annotation_fields = []
 9567
 9568        # Variants table
 9569        table_variants = self.get_table_variants()
 9570
 9571        for columns_map in columns_maps:
 9572
 9573            # Transcript column
 9574            transcripts_column = columns_map.get("transcripts_column", None)
 9575
 9576            # Transcripts infos columns
 9577            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9578
 9579            if transcripts_column is not None:
 9580
 9581                # Explode
 9582                added_columns += self.explode_infos(
 9583                    fields=[transcripts_column] + transcripts_infos_columns
 9584                )
 9585
 9586                # View clauses
 9587                clause_select = []
 9588                for field in [transcripts_column] + transcripts_infos_columns:
 9589                    clause_select.append(
 9590                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9591                    )
 9592                    if field not in [transcripts_column]:
 9593                        annotation_fields.append(field)
 9594
 9595                # Querey View
 9596                query = f""" 
 9597                    SELECT
 9598                        "#CHROM", POS, REF, ALT,
 9599                        "{transcripts_column}" AS 'transcript',
 9600                        {", ".join(clause_select)}
 9601                    FROM (
 9602                        SELECT 
 9603                            "#CHROM", POS, REF, ALT,
 9604                            {", ".join(clause_select)}
 9605                        FROM {table_variants}
 9606                        )
 9607                    WHERE "{transcripts_column}" IS NOT NULL
 9608                """
 9609
 9610                # Create temporary table
 9611                temporary_table = transcripts_table + "".join(
 9612                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9613                )
 9614
 9615                # Temporary_tables
 9616                temporary_tables.append(temporary_table)
 9617                query_view = f"""
 9618                    CREATE TEMPORARY TABLE {temporary_table}
 9619                    AS ({query})
 9620                """
 9621                self.execute_query(query=query_view)
 9622
 9623        return added_columns, temporary_tables, annotation_fields
 9624
 9625    def create_transcript_view_from_column_format(
 9626        self,
 9627        transcripts_table: str = "transcripts",
 9628        column_formats: dict = {},
 9629        temporary_tables: list = None,
 9630        annotation_fields: list = None,
 9631    ) -> tuple[list, list, list]:
 9632        """
 9633        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9634        specified column formats, adds additional columns and annotation fields, and returns the list of
 9635        temporary tables and annotation fields.
 9636
 9637        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9638        the table containing the transcripts data. This table will be used as the base table for creating
 9639        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9640        different table name if needed, defaults to transcripts
 9641        :type transcripts_table: str (optional)
 9642        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9643        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9644        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9645        the provided code snippet:
 9646        :type column_formats: dict
 9647        :param temporary_tables: The `temporary_tables` parameter in the
 9648        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9649        views created during the process of creating a transcript view from a column format. These temporary
 9650        views are used to manipulate and extract data before generating the final transcript view. It
 9651        :type temporary_tables: list
 9652        :param annotation_fields: The `annotation_fields` parameter in the
 9653        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9654        that are extracted from the temporary views created during the process. These annotation fields are
 9655        obtained by querying the temporary views and extracting the column names excluding specific columns
 9656        like `#CH
 9657        :type annotation_fields: list
 9658        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9659        `temporary_tables` and `annotation_fields`.
 9660        """
 9661
 9662        log.debug("Start transcrpts view creation from column format...")
 9663
 9664        #  "from_column_format": [
 9665        #     {
 9666        #         "transcripts_column": "ANN",
 9667        #         "transcripts_infos_column": "Feature_ID",
 9668        #     }
 9669        # ],
 9670
 9671        # Init
 9672        if temporary_tables is None:
 9673            temporary_tables = []
 9674        if annotation_fields is None:
 9675            annotation_fields = []
 9676
 9677        for column_format in column_formats:
 9678
 9679            # annotation field and transcript annotation field
 9680            annotation_field = column_format.get("transcripts_column", "ANN")
 9681            transcript_annotation = column_format.get(
 9682                "transcripts_infos_column", "Feature_ID"
 9683            )
 9684
 9685            # Temporary View name
 9686            temporary_view_name = transcripts_table + "".join(
 9687                random.choices(string.ascii_uppercase + string.digits, k=10)
 9688            )
 9689
 9690            # Create temporary view name
 9691            temporary_view_name = self.annotation_format_to_table(
 9692                uniquify=True,
 9693                annotation_field=annotation_field,
 9694                view_name=temporary_view_name,
 9695                annotation_id=transcript_annotation,
 9696            )
 9697
 9698            # Annotation fields
 9699            if temporary_view_name:
 9700                query_annotation_fields = f"""
 9701                    SELECT *
 9702                    FROM (
 9703                        DESCRIBE SELECT *
 9704                        FROM {temporary_view_name}
 9705                        )
 9706                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9707                """
 9708                df_annotation_fields = self.get_query_to_df(
 9709                    query=query_annotation_fields
 9710                )
 9711
 9712                # Add temporary view and annotation fields
 9713                temporary_tables.append(temporary_view_name)
 9714                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9715
 9716        return temporary_tables, annotation_fields
 9717
 9718    def create_transcript_view(
 9719        self,
 9720        transcripts_table: str = None,
 9721        transcripts_table_drop: bool = True,
 9722        param: dict = {},
 9723    ) -> str:
 9724        """
 9725        The `create_transcript_view` function generates a transcript view by processing data from a
 9726        specified table based on provided parameters and structural information.
 9727
 9728        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9729        is used to specify the name of the table that will store the final transcript view data. If a table
 9730        name is not provided, the function will create a new table to store the transcript view data, and by
 9731        default,, defaults to transcripts
 9732        :type transcripts_table: str (optional)
 9733        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9734        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9735        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9736        the function will drop the existing transcripts table if it exists, defaults to True
 9737        :type transcripts_table_drop: bool (optional)
 9738        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9739        contains information needed to create a transcript view. It includes details such as the structure
 9740        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9741        the view. This parameter allows for flexibility and customization
 9742        :type param: dict
 9743        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9744        created or modified during the execution of the function.
 9745        """
 9746
 9747        log.debug("Start transcripts view creation...")
 9748
 9749        # Default
 9750        transcripts_table_default = "transcripts"
 9751
 9752        # Param
 9753        if not param:
 9754            param = self.get_param()
 9755
 9756        # Struct
 9757        struct = param.get("transcripts", {}).get("struct", None)
 9758
 9759        if struct:
 9760
 9761            # Transcripts table
 9762            if transcripts_table is None:
 9763                transcripts_table = param.get("transcripts", {}).get(
 9764                    "table", transcripts_table_default
 9765                )
 9766
 9767            # added_columns
 9768            added_columns = []
 9769
 9770            # Temporary tables
 9771            temporary_tables = []
 9772
 9773            # Annotation fields
 9774            annotation_fields = []
 9775
 9776            # from columns map
 9777            columns_maps = struct.get("from_columns_map", [])
 9778            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
 9779                self.create_transcript_view_from_columns_map(
 9780                    transcripts_table=transcripts_table,
 9781                    columns_maps=columns_maps,
 9782                    added_columns=added_columns,
 9783                    temporary_tables=temporary_tables,
 9784                    annotation_fields=annotation_fields,
 9785                )
 9786            )
 9787            added_columns += added_columns_tmp
 9788            temporary_tables += temporary_tables_tmp
 9789            annotation_fields += annotation_fields_tmp
 9790
 9791            # from column format
 9792            column_formats = struct.get("from_column_format", [])
 9793            temporary_tables_tmp, annotation_fields_tmp = (
 9794                self.create_transcript_view_from_column_format(
 9795                    transcripts_table=transcripts_table,
 9796                    column_formats=column_formats,
 9797                    temporary_tables=temporary_tables,
 9798                    annotation_fields=annotation_fields,
 9799                )
 9800            )
 9801            temporary_tables += temporary_tables_tmp
 9802            annotation_fields += annotation_fields_tmp
 9803
 9804            # Merge temporary tables query
 9805            query_merge = ""
 9806            for temporary_table in temporary_tables:
 9807
 9808                # First temporary table
 9809                if not query_merge:
 9810                    query_merge = f"""
 9811                        SELECT * FROM {temporary_table}
 9812                    """
 9813                # other temporary table (using UNION)
 9814                else:
 9815                    query_merge += f"""
 9816                        UNION BY NAME SELECT * FROM {temporary_table}
 9817                    """
 9818
 9819            # Merge on transcript
 9820            query_merge_on_transcripts_annotation_fields = []
 9821            # Aggregate all annotations fields
 9822            for annotation_field in set(annotation_fields):
 9823                query_merge_on_transcripts_annotation_fields.append(
 9824                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
 9825                )
 9826            # Query for transcripts view
 9827            query_merge_on_transcripts = f"""
 9828                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
 9829                FROM ({query_merge})
 9830                GROUP BY "#CHROM", POS, REF, ALT, transcript
 9831            """
 9832
 9833            # Drop transcript view is necessary
 9834            if transcripts_table_drop:
 9835                query_drop = f"""
 9836                    DROP TABLE IF EXISTS {transcripts_table};
 9837                """
 9838                self.execute_query(query=query_drop)
 9839
 9840            # Merge and create transcript view
 9841            query_create_view = f"""
 9842                CREATE TABLE IF NOT EXISTS {transcripts_table}
 9843                AS {query_merge_on_transcripts}
 9844            """
 9845            self.execute_query(query=query_create_view)
 9846
 9847            # Remove added columns
 9848            for added_column in added_columns:
 9849                self.drop_column(column=added_column)
 9850
 9851        else:
 9852
 9853            transcripts_table = None
 9854
 9855        return transcripts_table
 9856
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` explodes a structured annotation INFO field
        (snpEff/VEP-like, e.g. 'ANN' with '|'-separated sub-fields) into a temporary table with
        one column per annotation sub-field, one row per annotation entry, plus a 'transcript'
        column taken from `annotation_id`.

        :param uniquify: The `uniquify` parameter is a boolean flag forwarded to
        `explode_annotation_format` that determines whether to ensure unique values in the
        exploded output, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter is the name of the INFO field
        that contains the structured annotation for each variant (its sub-field layout is read
        from the field's header description), defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter is the sub-field used as the
        transcript identifier; it is sanitized to alphanumeric characters and exposed as the
        'transcript' column of the created table, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter is the name of the temporary table to create
        with the exploded annotation data, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the temporary table created, or None when `annotation_field` is not
        present in the VCF header infos
        :rtype: str
        """

        # Name of the derived column holding the exploded annotation as JSON
        annotation_format = "annotation_explode"

        # Sanitize the transcript identifier: it is injected as a bare SQL
        # identifier in the CREATE TABLE query below
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns: any configured (truthy) prefix is
        # normalized to "INFO/".
        # NOTE(review): if get_explode_infos_prefix() returned None, the string
        # concatenations below would raise TypeError — assumes it always returns
        # a string; confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation field and its JSON counterpart
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides INFO field descriptions)
        vcf_reader = self.get_header()

        # Columns temporarily added to the variants table (dropped at the end)
        added_columns = []

        # Explode the annotation INFO field into a dedicated column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the header description,
            # expected in the quoted form: "... 'Allele | Annotation | ... '"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                # Map sanitized (alphanumeric-only) names to original sub-field names
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants and their raw annotation strings into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation string into a JSON document
            # (one object per annotation entry, keyed by sub-field name)
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the exploded annotations.
            # NOTE(review): the SQL references the local DataFrame
            # `dataframe_annotation_format` by name — relies on DuckDB's
            # replacement scan resolving it; assumes get_query_to_df executes
            # where that name is visible — TODO confirm.
            # NOTE(review): the query uses column name {annotation_format} while
            # the DataFrame column is {annotation_format_infos}; these only match
            # when prefix == "" — confirm behavior when a prefix is configured.
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key, detect its column type and build the SELECT clause
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key (original sub-field name)
                key = row.iloc[0]

                # Sanitized key, safe to use as a SQL identifier
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all non-empty values of this key to sample its type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the sampled values
                column_type = detect_column_type(df_json_type[key_clean])

                # One typed, NULLIF-guarded column per key (empty string -> NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table with one row per annotation entry
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: signal "nothing created"
            view_name = None

        # Remove columns temporarily added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
10009
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        The `transcript_view_to_variants` function writes transcript annotations back onto the
        variants table, either as a dedicated column and/or as an INFO field, in JSON format
        and/or in a pipe-separated structured format.

        :param transcripts_table: The `transcripts_table` parameter is used to specify the name
        of the table containing the transcripts data. If not provided, it is taken from the
        `param` dictionary or defaults to "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify
        the column in `transcripts_table` that contains the unique identifier for each
        transcript. If not provided, it is taken from `param` or defaults to "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: The `transcripts_info_json` parameter is the name of the
        column added to the variants table to hold the transcripts information as JSON. If not
        provided, it is taken from `param` (default None, i.e. disabled)
        :type transcripts_info_json: str
        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is the
        name of the INFO field (appended to the INFO column and declared in the VCF header) that
        will contain the transcripts information in JSON format. If not provided, it is taken
        from `param` (default None, i.e. disabled)
        :type transcripts_info_field_json: str
        :param transcripts_info_format: The `transcripts_info_format` parameter is the name of
        the column added to the variants table to hold the transcripts information in a
        pipe-separated structured format. If not provided, it is taken from `param` (default
        None, i.e. disabled)
        :type transcripts_info_format: str
        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is
        the name of the INFO field (appended to the INFO column and declared in the VCF header)
        that will contain the transcripts information in the structured format. If not provided,
        it is taken from `param` (default None, i.e. disabled)
        :type transcripts_info_field_format: str
        :param param: The `param` parameter is a dictionary that provides default values (from
        its "transcripts" section) for the parameters above when they are not explicitly given.
        Falls back to `self.get_param()` when empty
        :type param: dict
        :return: `True` if the variants table was updated, `False` when none of the four
        info/field output options is configured
        :rtype: bool
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Defaults for all parameters resolvable from param
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param (fall back to instance parameters)
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): the disabled guard below would reuse the field name as the
        # column name. Without it, when only transcripts_info_field_json is set the
        # JSON update query below uses the literal identifier "None" as alias and
        # reference (t.None) — it appears to work as a junk alias; confirm intended.
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # NOTE(review): same disabled guard as the JSON case above — the FORMAT
        # update query also embeds the literal identifier "None" when only
        # transcripts_info_field_format is set; confirm intended.
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output option is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns (all columns except coordinates and the
        # transcript identifier)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
                )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build, per info column:
        # - clause_select: split comma-joined values into one row each
        # - clause_to_json: struct entry for the JSON output
        # - clause_to_format: column reference for the pipe-separated output
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            clause_select.append(
                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
            )
            clause_to_json.append(f""" '{field}': "{field}" """)
            clause_to_format.append(f""" "{field}" """)

        # SET clauses for the two UPDATE queries
        update_set_json = []
        update_set_format = []

        # VCF header (INFO declarations are added below)
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" looks like a typo for "unknown" in the
            # Source/Version header fields (left unchanged here)
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json is not None:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append the JSON to the INFO column, skipping empty ('' or '.') values
            update_set_json.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field_json}=',
                                    t.{transcripts_info_json}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Update query: build one JSON object per variant, keyed by
            # transcript id, with one inner object per transcript
            query_update = f"""
                UPDATE {table_variants}
                    SET {", ".join(update_set_json)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                            concat(
                            '{{',
                            string_agg(
                                '"' || "{transcripts_column_id}" || '":' ||
                                to_json(json_output)
                            ),
                            '}}'
                            )::JSON AS {transcripts_info_json}
                    FROM
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            to_json(
                                {{{",".join(clause_to_json)}}}
                            )::JSON AS json_output
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        WHERE "{transcripts_column_id}" IS NOT NULL
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format is not None:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header (description documents the pipe-separated layout)
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format is not None:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append the structured value to the INFO column, skipping empty values
            update_set_format.append(
                f""" 
                    INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            CASE
                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                                THEN concat(
                                    ';{transcripts_info_field_format}=',
                                    t.{transcripts_info_format}
                                )
                                ELSE ''
                            END
                            )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Update query: one 'transcript|field1|field2|...' string per
            # transcript, aggregated per variant
            query_update = f"""
                UPDATE {table_variants}
                    SET {", ".join(update_set_format)}
                FROM
                (
                    SELECT
                        "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                    FROM 
                        (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            "{transcripts_column_id}",
                            concat(
                                "{transcripts_column_id}",
                                '|',
                                {", '|', ".join(clause_to_format)}
                            ) AS {transcripts_info_format}
                        FROM
                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                        )
                    GROUP BY "#CHROM", POS, REF, ALT
                ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
class Variants:
   34class Variants:
   35
   36    def __init__(
   37        self,
   38        conn=None,
   39        input: str = None,
   40        output: str = None,
   41        config: dict = {},
   42        param: dict = {},
   43        load: bool = False,
   44    ) -> None:
   45        """
   46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   47        header
   48
   49        :param conn: the connection to the database
   50        :param input: the input file
   51        :param output: the output file
   52        :param config: a dictionary containing the configuration of the model
   53        :param param: a dictionary containing the parameters of the model
   54        """
   55
   56        # Init variables
   57        self.init_variables()
   58
   59        # Input
   60        self.set_input(input)
   61
   62        # Config
   63        self.set_config(config)
   64
   65        # Param
   66        self.set_param(param)
   67
   68        # Output
   69        self.set_output(output)
   70
   71        # connexion
   72        self.set_connexion(conn)
   73
   74        # Header
   75        self.set_header()
   76
   77        # Load data
   78        if load:
   79            self.load_data()
   80
   81    def set_input(self, input: str = None) -> None:
   82        """
   83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
   84        attributes in the class accordingly.
   85
   86        :param input: The `set_input` method in the provided code snippet is used to set attributes
   87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
   88        :type input: str
   89        """
   90
   91        if input and not isinstance(input, str):
   92            try:
   93                self.input = input.name
   94            except:
   95                log.error(f"Input file '{input} in bad format")
   96                raise ValueError(f"Input file '{input} in bad format")
   97        else:
   98            self.input = input
   99
  100        # Input format
  101        if input:
  102            input_name, input_extension = os.path.splitext(self.input)
  103            self.input_name = input_name
  104            self.input_extension = input_extension
  105            self.input_format = self.input_extension.replace(".", "")
  106
  107    def set_config(self, config: dict) -> None:
  108        """
  109        The set_config function takes a config object and assigns it as the configuration object for the
  110        class.
  111
  112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  113        contains configuration settings for the class. When you call the `set_config` function with a
  114        dictionary object as the argument, it will set that dictionary as the configuration object for
  115        the class
  116        :type config: dict
  117        """
  118
  119        self.config = config
  120
  121    def set_param(self, param: dict) -> None:
  122        """
  123        This function sets a parameter object for the class based on the input dictionary.
  124
  125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  126        as the `param` attribute of the class instance
  127        :type param: dict
  128        """
  129
  130        self.param = param
  131
  132    def init_variables(self) -> None:
  133        """
  134        This function initializes the variables that will be used in the rest of the class
  135        """
  136
  137        self.prefix = "howard"
  138        self.table_variants = "variants"
  139        self.dataframe = None
  140
  141        self.comparison_map = {
  142            "gt": ">",
  143            "gte": ">=",
  144            "lt": "<",
  145            "lte": "<=",
  146            "equals": "=",
  147            "contains": "SIMILAR TO",
  148        }
  149
  150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  151
  152        self.code_type_map_to_sql = {
  153            "Integer": "INTEGER",
  154            "String": "VARCHAR",
  155            "Float": "FLOAT",
  156            "Flag": "VARCHAR",
  157        }
  158
  159        self.index_additionnal_fields = []
  160
  161    def get_indexing(self) -> bool:
  162        """
  163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  164        returns False.
  165        :return: The value of the indexing parameter.
  166        """
  167
  168        return self.get_param().get("indexing", False)
  169
  170    def get_connexion_config(self) -> dict:
  171        """
  172        The function `get_connexion_config` returns a dictionary containing the configuration for a
  173        connection, including the number of threads and memory limit.
  174        :return: a dictionary containing the configuration for the Connexion library.
  175        """
  176
  177        # config
  178        config = self.get_config()
  179
  180        # Connexion config
  181        connexion_config = {}
  182        threads = self.get_threads()
  183
  184        # Threads
  185        if threads:
  186            connexion_config["threads"] = threads
  187
  188        # Memory
  189        # if config.get("memory", None):
  190        #     connexion_config["memory_limit"] = config.get("memory")
  191        if self.get_memory():
  192            connexion_config["memory_limit"] = self.get_memory()
  193
  194        # Temporary directory
  195        if config.get("tmp", None):
  196            connexion_config["temp_directory"] = config.get("tmp")
  197
  198        # Access
  199        if config.get("access", None):
  200            access = config.get("access")
  201            if access in ["RO"]:
  202                access = "READ_ONLY"
  203            elif access in ["RW"]:
  204                access = "READ_WRITE"
  205            connexion_db = self.get_connexion_db()
  206            if connexion_db in ":memory:":
  207                access = "READ_WRITE"
  208            connexion_config["access_mode"] = access
  209
  210        return connexion_config
  211
  212    def get_duckdb_settings(self) -> dict:
  213        """
  214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  215        string.
  216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  217        """
  218
  219        # config
  220        config = self.get_config()
  221
  222        # duckdb settings
  223        duckdb_settings_dict = {}
  224        if config.get("duckdb_settings", None):
  225            duckdb_settings = config.get("duckdb_settings")
  226            duckdb_settings = full_path(duckdb_settings)
  227            # duckdb setting is a file
  228            if os.path.exists(duckdb_settings):
  229                with open(duckdb_settings) as json_file:
  230                    duckdb_settings_dict = yaml.safe_load(json_file)
  231            # duckdb settings is a string
  232            else:
  233                duckdb_settings_dict = json.loads(duckdb_settings)
  234
  235        return duckdb_settings_dict
  236
  237    def set_connexion_db(self) -> str:
  238        """
  239        The function `set_connexion_db` returns the appropriate database connection string based on the
  240        input format and connection type.
  241        :return: the value of the variable `connexion_db`.
  242        """
  243
  244        # Default connexion db
  245        default_connexion_db = ":memory:"
  246
  247        # Find connexion db
  248        if self.get_input_format() in ["db", "duckdb"]:
  249            connexion_db = self.get_input()
  250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  251            connexion_db = default_connexion_db
  252        elif self.get_connexion_type() in ["tmpfile"]:
  253            tmp_name = tempfile.mkdtemp(
  254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  255            )
  256            connexion_db = f"{tmp_name}/tmp.db"
  257        elif self.get_connexion_type() != "":
  258            connexion_db = self.get_connexion_type()
  259        else:
  260            connexion_db = default_connexion_db
  261
  262        # Set connexion db
  263        self.connexion_db = connexion_db
  264
  265        return connexion_db
  266
  267    def set_connexion(self, conn) -> None:
  268        """
  269        The function `set_connexion` creates a connection to a database, with options for different
  270        database formats and settings.
  271
  272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  273        database. If a connection is not provided, a new connection to an in-memory database is created.
  274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  275        sqlite
  276        """
  277
  278        # Connexion db
  279        connexion_db = self.set_connexion_db()
  280
  281        # Connexion config
  282        connexion_config = self.get_connexion_config()
  283
  284        # Connexion format
  285        connexion_format = self.get_config().get("connexion_format", "duckdb")
  286        # Set connexion format
  287        self.connexion_format = connexion_format
  288
  289        # Connexion
  290        if not conn:
  291            if connexion_format in ["duckdb"]:
  292                conn = duckdb.connect(connexion_db, config=connexion_config)
  293                # duckDB settings
  294                duckdb_settings = self.get_duckdb_settings()
  295                if duckdb_settings:
  296                    for setting in duckdb_settings:
  297                        setting_value = duckdb_settings.get(setting)
  298                        if isinstance(setting_value, str):
  299                            setting_value = f"'{setting_value}'"
  300                        conn.execute(f"PRAGMA {setting}={setting_value};")
  301            elif connexion_format in ["sqlite"]:
  302                conn = sqlite3.connect(connexion_db)
  303
  304        # Set connexion
  305        self.conn = conn
  306
  307        # Log
  308        log.debug(f"connexion_format: {connexion_format}")
  309        log.debug(f"connexion_db: {connexion_db}")
  310        log.debug(f"connexion config: {connexion_config}")
  311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  312
  313    def set_output(self, output: str = None) -> None:
  314        """
  315        The `set_output` function in Python sets the output file based on the input or a specified key
  316        in the config file, extracting the output name, extension, and format.
  317
  318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  319        the output file. If the config file has an 'output' key, the method sets the output to the value
  320        of that key. If no output is provided, it sets the output to `None`
  321        :type output: str
  322        """
  323
  324        if output and not isinstance(output, str):
  325            self.output = output.name
  326        else:
  327            self.output = output
  328
  329        # Output format
  330        if self.output:
  331            output_name, output_extension = os.path.splitext(self.output)
  332            self.output_name = output_name
  333            self.output_extension = output_extension
  334            self.output_format = self.output_extension.replace(".", "")
  335        else:
  336            self.output_name = None
  337            self.output_extension = None
  338            self.output_format = None
  339
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (``self.header_list``) and as a ``vcf.Reader`` object
        (``self.header_vcf``); both are None when there is no input file.

        The header is looked up, in order: in the file given by the
        ``header_file`` config key, inside the input file itself (vcf/hdr
        formats), in an external ``<input>.hdr`` file, or inferred from the
        file columns; any failure falls back to a minimal default VCF header.

        :raises ValueError: if the input file format is not supported
        """

        input_file = self.get_input()
        # Minimal valid VCF header used as fallback (columns are tab-separated)
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            # Supported input formats for header detection
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # Header explicitly provided via the "header_file" config key
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # Header embedded within the input file itself (vcf/hdr format)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # Compressed vcf file (.vcf.gz): read through bgzf
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # Uncompressed vcf file (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # Header provided in the default external file "<input>.hdr"
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    # Last resort: infer header info fields and columns from
                    # the file content itself
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database object on the input file
                            db_for_header = Database(database=input_file)

                            # Get header infos fields derived from the columns
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get the real columns present in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header to a temporary file
                            # NOTE(review): vcf.Writer appears to emit the
                            # header at construction; the writer object is
                            # deliberately discarded — confirm against PyVCF
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace the #CHROM line with the real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:
                        # Best-effort: any failure falls back to the default
                        # VCF header (broad except kept deliberately)
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # Unsupported format: fail explicitly

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Guard against an empty header
            if not header_list:
                header_list = default_header_list

            # Header as a list of strings
            self.header_list = header_list

            # Header as a VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header available
            self.header_list = None
            self.header_vcf = None
  441
  442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  443        """
  444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  445        DataFrame based on the connection format.
  446
  447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  448        represents the SQL query you want to execute. This query will be used to fetch data from a
  449        database and convert it into a pandas DataFrame
  450        :type query: str
  451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  453        function will only fetch up to that number of rows from the database query result. If no limit
  454        is specified,
  455        :type limit: int
  456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  457        """
  458
  459        # Connexion format
  460        connexion_format = self.get_connexion_format()
  461
  462        # Limit in query
  463        if limit:
  464            pd.set_option("display.max_rows", limit)
  465            if connexion_format in ["duckdb"]:
  466                df = (
  467                    self.conn.execute(query)
  468                    .fetch_record_batch(limit)
  469                    .read_next_batch()
  470                    .to_pandas()
  471                )
  472            elif connexion_format in ["sqlite"]:
  473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  474
  475        # Full query
  476        else:
  477            if connexion_format in ["duckdb"]:
  478                df = self.conn.execute(query).df()
  479            elif connexion_format in ["sqlite"]:
  480                df = pd.read_sql_query(query, self.conn)
  481
  482        return df
  483
  484    def get_overview(self) -> None:
  485        """
  486        The function prints the input, output, config, and dataframe of the current object
  487        """
  488        table_variants_from = self.get_table_variants(clause="from")
  489        sql_columns = self.get_header_columns_as_sql()
  490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  491        df = self.get_query_to_df(sql_query_export)
  492        log.info(
  493            "Input:  "
  494            + str(self.get_input())
  495            + " ["
  496            + str(str(self.get_input_format()))
  497            + "]"
  498        )
  499        log.info(
  500            "Output: "
  501            + str(self.get_output())
  502            + " ["
  503            + str(str(self.get_output_format()))
  504            + "]"
  505        )
  506        log.info("Config: ")
  507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  508            "\n"
  509        ):
  510            log.info("\t" + str(d))
  511        log.info("Param: ")
  512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  513            "\n"
  514        ):
  515            log.info("\t" + str(d))
  516        log.info("Sample list: " + str(self.get_header_sample_list()))
  517        log.info("Dataframe: ")
  518        for d in str(df).split("\n"):
  519            log.info("\t" + str(d))
  520
  521        # garbage collector
  522        del df
  523        gc.collect()
  524
  525        return None
  526
  527    def get_stats(self) -> dict:
  528        """
  529        The `get_stats` function calculates and returns various statistics of the current object,
  530        including information about the input file, variants, samples, header fields, quality, and
  531        SNVs/InDels.
  532        :return: a dictionary containing various statistics of the current object. The dictionary has
  533        the following structure:
  534        """
  535
  536        # Log
  537        log.info(f"Stats Calculation...")
  538
  539        # table varaints
  540        table_variants_from = self.get_table_variants()
  541
  542        # stats dict
  543        stats = {"Infos": {}}
  544
  545        ### File
  546        input_file = self.get_input()
  547        stats["Infos"]["Input file"] = input_file
  548
  549        # Header
  550        header_infos = self.get_header().infos
  551        header_formats = self.get_header().formats
  552        header_infos_list = list(header_infos)
  553        header_formats_list = list(header_formats)
  554
  555        ### Variants
  556
  557        stats["Variants"] = {}
  558
  559        # Variants by chr
  560        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  561        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  562        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  563            by=["CHROM"], kind="quicksort"
  564        )
  565
  566        # Total number of variants
  567        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  568
  569        # Calculate percentage
  570        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  571            lambda x: (x / nb_of_variants)
  572        )
  573
  574        stats["Variants"]["Number of variants by chromosome"] = (
  575            nb_of_variants_by_chrom.to_dict(orient="index")
  576        )
  577
  578        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  579
  580        ### Samples
  581
  582        # Init
  583        samples = {}
  584        nb_of_samples = 0
  585
  586        # Check Samples
  587        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  588            log.debug(f"Check samples...")
  589            for sample in self.get_header_sample_list():
  590                sql_query_samples = f"""
  591                    SELECT  '{sample}' as sample,
  592                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  593                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  594                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  595                    FROM {table_variants_from}
  596                    WHERE (
  597                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  598                        AND
  599                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  600                      )
  601                    GROUP BY genotype
  602                    """
  603                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  604                sample_genotype_count = sql_query_genotype_df["count"].sum()
  605                if len(sql_query_genotype_df):
  606                    nb_of_samples += 1
  607                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  608                        sql_query_genotype_df.to_dict(orient="index")
  609                    )
  610
  611            stats["Samples"] = samples
  612            stats["Infos"]["Number of samples"] = nb_of_samples
  613
  614        # #
  615        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  616        #     stats["Infos"]["Number of samples"] = nb_of_samples
  617        # elif nb_of_samples:
  618        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  619
  620        ### INFO and FORMAT fields
  621        header_types_df = {}
  622        header_types_list = {
  623            "List of INFO fields": header_infos,
  624            "List of FORMAT fields": header_formats,
  625        }
  626        i = 0
  627        for header_type in header_types_list:
  628
  629            header_type_infos = header_types_list.get(header_type)
  630            header_infos_dict = {}
  631
  632            for info in header_type_infos:
  633
  634                i += 1
  635                header_infos_dict[i] = {}
  636
  637                # ID
  638                header_infos_dict[i]["id"] = info
  639
  640                # num
  641                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  642                if header_type_infos[info].num in genotype_map.keys():
  643                    header_infos_dict[i]["Number"] = genotype_map.get(
  644                        header_type_infos[info].num
  645                    )
  646                else:
  647                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  648
  649                # type
  650                if header_type_infos[info].type:
  651                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  652                else:
  653                    header_infos_dict[i]["Type"] = "."
  654
  655                # desc
  656                if header_type_infos[info].desc != None:
  657                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  658                else:
  659                    header_infos_dict[i]["Description"] = ""
  660
  661            if len(header_infos_dict):
  662                header_types_df[header_type] = pd.DataFrame.from_dict(
  663                    header_infos_dict, orient="index"
  664                ).to_dict(orient="index")
  665
  666        # Stats
  667        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  668        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  669        stats["Header"] = header_types_df
  670
  671        ### QUAL
  672        if "QUAL" in self.get_header_columns():
  673            sql_query_qual = f"""
  674                    SELECT
  675                        avg(CAST(QUAL AS INTEGER)) AS Average,
  676                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  677                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  678                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  679                        median(CAST(QUAL AS INTEGER)) AS Median,
  680                        variance(CAST(QUAL AS INTEGER)) AS Variance
  681                    FROM {table_variants_from}
  682                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  683                    """
  684
  685            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  686            stats["Quality"] = {"Stats": qual}
  687
  688        ### SNV and InDel
  689
  690        sql_query_snv = f"""
  691            
  692            SELECT Type, count FROM (
  693
  694                    SELECT
  695                        'Total' AS Type,
  696                        count(*) AS count
  697                    FROM {table_variants_from}
  698
  699                    UNION
  700
  701                    SELECT
  702                        'MNV' AS Type,
  703                        count(*) AS count
  704                    FROM {table_variants_from}
  705                    WHERE len(REF) > 1 AND len(ALT) > 1
  706                    AND len(REF) = len(ALT)
  707
  708                    UNION
  709
  710                    SELECT
  711                        'InDel' AS Type,
  712                        count(*) AS count
  713                    FROM {table_variants_from}
  714                    WHERE len(REF) > 1 OR len(ALT) > 1
  715                    AND len(REF) != len(ALT)
  716                    
  717                    UNION
  718
  719                    SELECT
  720                        'SNV' AS Type,
  721                        count(*) AS count
  722                    FROM {table_variants_from}
  723                    WHERE len(REF) = 1 AND len(ALT) = 1
  724
  725                )
  726
  727            ORDER BY count DESC
  728
  729                """
  730        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  731
  732        sql_query_snv_substitution = f"""
  733                SELECT
  734                    concat(REF, '>', ALT) AS 'Substitution',
  735                    count(*) AS count
  736                FROM {table_variants_from}
  737                WHERE len(REF) = 1 AND len(ALT) = 1
  738                GROUP BY REF, ALT
  739                ORDER BY count(*) DESC
  740                """
  741        snv_substitution = (
  742            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  743        )
  744        stats["Variants"]["Counts"] = snv_indel
  745        stats["Variants"]["Substitutions"] = snv_substitution
  746
  747        return stats
  748
  749    def stats_to_file(self, file: str = None) -> str:
  750        """
  751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  752        into a JSON object, and writes the JSON object to the specified file.
  753
  754        :param file: The `file` parameter is a string that represents the file path where the JSON data
  755        will be written
  756        :type file: str
  757        :return: the name of the file that was written to.
  758        """
  759
  760        # Get stats
  761        stats = self.get_stats()
  762
  763        # Serializing json
  764        json_object = json.dumps(stats, indent=4)
  765
  766        # Writing to sample.json
  767        with open(file, "w") as outfile:
  768            outfile.write(json_object)
  769
  770        return file
  771
  772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  773        """
  774        The `print_stats` function generates a markdown file and prints the statistics contained in a
  775        JSON file in a formatted manner.
  776
  777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  779        provided, a temporary directory will be created and the stats will be saved in a file named
  780        "stats.md" within that
  781        :type output_file: str
  782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  783        file where the statistics will be saved. If no value is provided, a temporary directory will be
  784        created and a default file name "stats.json" will be used
  785        :type json_file: str
  786        :return: The function `print_stats` does not return any value. It has a return type annotation
  787        of `None`.
  788        """
  789
  790        # Full path
  791        output_file = full_path(output_file)
  792        json_file = full_path(json_file)
  793
  794        with tempfile.TemporaryDirectory() as tmpdir:
  795
  796            # Files
  797            if not output_file:
  798                output_file = os.path.join(tmpdir, "stats.md")
  799            if not json_file:
  800                json_file = os.path.join(tmpdir, "stats.json")
  801
  802            # Create folders
  803            if not os.path.exists(os.path.dirname(output_file)):
  804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  805            if not os.path.exists(os.path.dirname(json_file)):
  806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  807
  808            # Create stats JSON file
  809            stats_file = self.stats_to_file(file=json_file)
  810
  811            # Print stats file
  812            with open(stats_file) as f:
  813                stats = yaml.safe_load(f)
  814
  815            # Output
  816            output_title = []
  817            output_index = []
  818            output = []
  819
  820            # Title
  821            output_title.append("# HOWARD Stats")
  822
  823            # Index
  824            output_index.append("## Index")
  825
  826            # Process sections
  827            for section in stats:
  828                infos = stats.get(section)
  829                section_link = "#" + section.lower().replace(" ", "-")
  830                output.append(f"## {section}")
  831                output_index.append(f"- [{section}]({section_link})")
  832
  833                if len(infos):
  834                    for info in infos:
  835                        try:
  836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  837                            is_df = True
  838                        except:
  839                            try:
  840                                df = pd.DataFrame.from_dict(
  841                                    json.loads((infos.get(info))), orient="index"
  842                                )
  843                                is_df = True
  844                            except:
  845                                is_df = False
  846                        if is_df:
  847                            output.append(f"### {info}")
  848                            info_link = "#" + info.lower().replace(" ", "-")
  849                            output_index.append(f"   - [{info}]({info_link})")
  850                            output.append(f"{df.to_markdown(index=False)}")
  851                        else:
  852                            output.append(f"- {info}: {infos.get(info)}")
  853                else:
  854                    output.append(f"NA")
  855
  856            # Write stats in markdown file
  857            with open(output_file, "w") as fp:
  858                for item in output_title:
  859                    fp.write("%s\n" % item)
  860                for item in output_index:
  861                    fp.write("%s\n" % item)
  862                for item in output:
  863                    fp.write("%s\n" % item)
  864
  865            # Output stats in markdown
  866            print("")
  867            print("\n\n".join(output_title))
  868            print("")
  869            print("\n\n".join(output))
  870            print("")
  871
  872        return None
  873
  874    def get_input(self) -> str:
  875        """
  876        It returns the value of the input variable.
  877        :return: The input is being returned.
  878        """
  879        return self.input
  880
  881    def get_input_format(self, input_file: str = None) -> str:
  882        """
  883        This function returns the format of the input variable, either from the provided input file or
  884        by prompting for input.
  885
  886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  887        represents the file path of the input file. If no `input_file` is provided when calling the
  888        method, it will default to `None`
  889        :type input_file: str
  890        :return: The format of the input variable is being returned.
  891        """
  892
  893        if not input_file:
  894            input_file = self.get_input()
  895        input_format = get_file_format(input_file)
  896        return input_format
  897
  898    def get_input_compressed(self, input_file: str = None) -> str:
  899        """
  900        The function `get_input_compressed` returns the format of the input variable after compressing
  901        it.
  902
  903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  904        that represents the file path of the input file. If no `input_file` is provided when calling the
  905        method, it will default to `None` and the method will then call `self.get_input()` to
  906        :type input_file: str
  907        :return: The function `get_input_compressed` returns the compressed format of the input
  908        variable.
  909        """
  910
  911        if not input_file:
  912            input_file = self.get_input()
  913        input_compressed = get_file_compressed(input_file)
  914        return input_compressed
  915
  916    def get_output(self) -> str:
  917        """
  918        It returns the output of the neuron.
  919        :return: The output of the neural network.
  920        """
  921
  922        return self.output
  923
  924    def get_output_format(self, output_file: str = None) -> str:
  925        """
  926        The function `get_output_format` returns the format of the input variable or the output file if
  927        provided.
  928
  929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  930        that represents the file path of the output file. If no `output_file` is provided when calling
  931        the method, it will default to the output obtained from the `get_output` method of the class
  932        instance. The
  933        :type output_file: str
  934        :return: The format of the input variable is being returned.
  935        """
  936
  937        if not output_file:
  938            output_file = self.get_output()
  939        output_format = get_file_format(output_file)
  940
  941        return output_format
  942
  943    def get_config(self) -> dict:
  944        """
  945        It returns the config
  946        :return: The config variable is being returned.
  947        """
  948        return self.config
  949
  950    def get_param(self) -> dict:
  951        """
  952        It returns the param
  953        :return: The param variable is being returned.
  954        """
  955        return self.param
  956
  957    def get_connexion_db(self) -> str:
  958        """
  959        It returns the connexion_db attribute of the object
  960        :return: The connexion_db is being returned.
  961        """
  962        return self.connexion_db
  963
  964    def get_prefix(self) -> str:
  965        """
  966        It returns the prefix of the object.
  967        :return: The prefix is being returned.
  968        """
  969        return self.prefix
  970
  971    def get_table_variants(self, clause: str = "select") -> str:
  972        """
  973        This function returns the table_variants attribute of the object
  974
  975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
  976        defaults to select (optional)
  977        :return: The table_variants attribute of the object.
  978        """
  979
  980        # Access
  981        access = self.get_config().get("access", None)
  982
  983        # Clauses "select", "where", "update"
  984        if clause in ["select", "where", "update"]:
  985            table_variants = self.table_variants
  986        # Clause "from"
  987        elif clause in ["from"]:
  988            # For Read Only
  989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
  990                input_file = self.get_input()
  991                table_variants = f"'{input_file}' as variants"
  992            # For Read Write
  993            else:
  994                table_variants = f"{self.table_variants} as variants"
  995        else:
  996            table_variants = self.table_variants
  997        return table_variants
  998
  999    def get_tmp_dir(self) -> str:
 1000        """
 1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1002        parameters or a default path.
 1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1004        configuration, parameters, and a default value of "/tmp".
 1005        """
 1006
 1007        return get_tmp(
 1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1009        )
 1010
 1011    def get_connexion_type(self) -> str:
 1012        """
 1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1014
 1015        :return: The connexion type is being returned.
 1016        """
 1017        return self.get_config().get("connexion_type", "memory")
 1018
 1019    def get_connexion(self):
 1020        """
 1021        It returns the connection object
 1022
 1023        :return: The connection object.
 1024        """
 1025        return self.conn
 1026
 1027    def close_connexion(self) -> None:
 1028        """
 1029        This function closes the connection to the database.
 1030        :return: The connection is being closed.
 1031        """
 1032        return self.conn.close()
 1033
 1034    def get_header(self, type: str = "vcf"):
 1035        """
 1036        This function returns the header of the VCF file as a list of strings
 1037
 1038        :param type: the type of header you want to get, defaults to vcf (optional)
 1039        :return: The header of the vcf file.
 1040        """
 1041
 1042        if self.header_vcf:
 1043            if type == "vcf":
 1044                return self.header_vcf
 1045            elif type == "list":
 1046                return self.header_list
 1047        else:
 1048            if type == "vcf":
 1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1050                return header
 1051            elif type == "list":
 1052                return vcf_required
 1053
 1054    def get_header_length(self, file: str = None) -> int:
 1055        """
 1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1057        line.
 1058
 1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1060        header file. If this argument is provided, the function will read the header from the specified
 1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1062        :type file: str
 1063        :return: the length of the header list, excluding the #CHROM line.
 1064        """
 1065
 1066        if file:
 1067            return len(self.read_vcf_header_file(file=file)) - 1
 1068        elif self.get_header(type="list"):
 1069            return len(self.get_header(type="list")) - 1
 1070        else:
 1071            return 0
 1072
 1073    def get_header_columns(self) -> str:
 1074        """
 1075        This function returns the header list of a VCF
 1076
 1077        :return: The length of the header list.
 1078        """
 1079        if self.get_header():
 1080            return self.get_header(type="list")[-1]
 1081        else:
 1082            return ""
 1083
 1084    def get_header_columns_as_list(self) -> list:
 1085        """
 1086        This function returns the header list of a VCF
 1087
 1088        :return: The length of the header list.
 1089        """
 1090        if self.get_header():
 1091            return self.get_header_columns().strip().split("\t")
 1092        else:
 1093            return []
 1094
 1095    def get_header_columns_as_sql(self) -> str:
 1096        """
 1097        This function retruns header length (without #CHROM line)
 1098
 1099        :return: The length of the header list.
 1100        """
 1101        sql_column_list = []
 1102        for col in self.get_header_columns_as_list():
 1103            sql_column_list.append(f'"{col}"')
 1104        return ",".join(sql_column_list)
 1105
 1106    def get_header_sample_list(self) -> list:
 1107        """
 1108        This function retruns header length (without #CHROM line)
 1109
 1110        :return: The length of the header list.
 1111        """
 1112        return self.header_vcf.samples
 1113
 1114    def get_verbose(self) -> bool:
 1115        """
 1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1117        exist
 1118
 1119        :return: The value of the key "verbose" in the config dictionary.
 1120        """
 1121        return self.get_config().get("verbose", False)
 1122
 1123    def get_connexion_format(self) -> str:
 1124        """
 1125        It returns the connexion format of the object.
 1126        :return: The connexion_format is being returned.
 1127        """
 1128        connexion_format = self.connexion_format
 1129        if connexion_format not in ["duckdb", "sqlite"]:
 1130            log.error(f"Unknown connexion format {connexion_format}")
 1131            raise ValueError(f"Unknown connexion format {connexion_format}")
 1132        else:
 1133            return connexion_format
 1134
 1135    def insert_file_to_table(
 1136        self,
 1137        file,
 1138        columns: str,
 1139        header_len: int = 0,
 1140        sep: str = "\t",
 1141        chunksize: int = 1000000,
 1142    ) -> None:
 1143        """
 1144        The function reads a file in chunks and inserts each chunk into a table based on the specified
 1145        database format.
 1146
 1147        :param file: The `file` parameter is the file that you want to load into a table. It should be
 1148        the path to the file on your system
 1149        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
 1150        should contain the names of the columns in the table where the data will be inserted. The column
 1151        names should be separated by commas within the string. For example, if you have columns named
 1152        "id", "name
 1153        :type columns: str
 1154        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
 1155        the number of lines to skip at the beginning of the file before reading the actual data. This
 1156        parameter allows you to skip any header information present in the file before processing the
 1157        data, defaults to 0
 1158        :type header_len: int (optional)
 1159        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
 1160        separator character that is used in the file being read. In this case, the default separator is
 1161        set to `\t`, which represents a tab character. You can change this parameter to a different
 1162        separator character if, defaults to \t
 1163        :type sep: str (optional)
 1164        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
 1165        when processing the file in chunks. In the provided code snippet, the default value for
 1166        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
 1167        to 1000000
 1168        :type chunksize: int (optional)
 1169        """
 1170
 1171        # Config
 1172        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
 1173        connexion_format = self.get_connexion_format()
 1174
 1175        log.debug("chunksize: " + str(chunksize))
 1176
 1177        if chunksize:
 1178            for chunk in pd.read_csv(
 1179                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
 1180            ):
 1181                if connexion_format in ["duckdb"]:
 1182                    sql_insert_into = (
 1183                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
 1184                    )
 1185                    self.conn.execute(sql_insert_into)
 1186                elif connexion_format in ["sqlite"]:
 1187                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1188
 1189    def load_data(
 1190        self,
 1191        input_file: str = None,
 1192        drop_variants_table: bool = False,
 1193        sample_size: int = 20480,
 1194    ) -> None:
 1195        """
 1196        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1197        table before loading the data and specify a sample size.
 1198
 1199        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1200        table
 1201        :type input_file: str
 1202        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1203        determines whether the variants table should be dropped before loading the data. If set to
 1204        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1205        not be dropped, defaults to False
 1206        :type drop_variants_table: bool (optional)
 1207        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1208        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1209        20480
 1210        :type sample_size: int (optional)
 1211        """
 1212
 1213        log.info("Loading...")
 1214
 1215        # change input file
 1216        if input_file:
 1217            self.set_input(input_file)
 1218            self.set_header()
 1219
 1220        # drop variants table
 1221        if drop_variants_table:
 1222            self.drop_variants_table()
 1223
 1224        # get table variants
 1225        table_variants = self.get_table_variants()
 1226
 1227        # Access
 1228        access = self.get_config().get("access", None)
 1229        log.debug(f"access: {access}")
 1230
 1231        # Input format and compress
 1232        input_format = self.get_input_format()
 1233        input_compressed = self.get_input_compressed()
 1234        log.debug(f"input_format: {input_format}")
 1235        log.debug(f"input_compressed: {input_compressed}")
 1236
 1237        # input_compressed_format
 1238        if input_compressed:
 1239            input_compressed_format = "gzip"
 1240        else:
 1241            input_compressed_format = "none"
 1242        log.debug(f"input_compressed_format: {input_compressed_format}")
 1243
 1244        # Connexion format
 1245        connexion_format = self.get_connexion_format()
 1246
 1247        # Sample size
 1248        if not sample_size:
 1249            sample_size = -1
 1250        log.debug(f"sample_size: {sample_size}")
 1251
 1252        # Load data
 1253        log.debug(f"Load Data from {input_format}")
 1254
 1255        # DuckDB connexion
 1256        if connexion_format in ["duckdb"]:
 1257
 1258            # Database already exists
 1259            if self.input_format in ["db", "duckdb"]:
 1260
 1261                if connexion_format in ["duckdb"]:
 1262                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1263                else:
 1264                    log.error(
 1265                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1266                    )
 1267                    raise ValueError(
 1268                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1269                    )
 1270
 1271            # Load from existing database format
 1272            else:
 1273
 1274                try:
 1275                    # Create Table or View
 1276                    database = Database(database=self.input)
 1277                    sql_from = database.get_sql_from(sample_size=sample_size)
 1278
 1279                    if access in ["RO"]:
 1280                        sql_load = (
 1281                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1282                        )
 1283                    else:
 1284                        sql_load = (
 1285                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1286                        )
 1287                    self.conn.execute(sql_load)
 1288
 1289                except:
 1290                    # Format not available
 1291                    log.error(f"Input file format '{self.input_format}' not available")
 1292                    raise ValueError(
 1293                        f"Input file format '{self.input_format}' not available"
 1294                    )
 1295
 1296        # SQLite connexion
 1297        elif connexion_format in ["sqlite"] and input_format in [
 1298            "vcf",
 1299            "tsv",
 1300            "csv",
 1301            "psv",
 1302        ]:
 1303
 1304            # Main structure
 1305            structure = {
 1306                "#CHROM": "VARCHAR",
 1307                "POS": "INTEGER",
 1308                "ID": "VARCHAR",
 1309                "REF": "VARCHAR",
 1310                "ALT": "VARCHAR",
 1311                "QUAL": "VARCHAR",
 1312                "FILTER": "VARCHAR",
 1313                "INFO": "VARCHAR",
 1314            }
 1315
 1316            # Strcuture with samples
 1317            structure_complete = structure
 1318            if self.get_header_sample_list():
 1319                structure["FORMAT"] = "VARCHAR"
 1320                for sample in self.get_header_sample_list():
 1321                    structure_complete[sample] = "VARCHAR"
 1322
 1323            # Columns list for create and insert
 1324            sql_create_table_columns = []
 1325            sql_create_table_columns_list = []
 1326            for column in structure_complete:
 1327                column_type = structure_complete[column]
 1328                sql_create_table_columns.append(
 1329                    f'"{column}" {column_type} default NULL'
 1330                )
 1331                sql_create_table_columns_list.append(f'"{column}"')
 1332
 1333            # Create database
 1334            log.debug(f"Create Table {table_variants}")
 1335            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1336            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1337            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1338            self.conn.execute(sql_create_table)
 1339
 1340            # chunksize define length of file chunk load file
 1341            chunksize = 100000
 1342
 1343            # delimiter
 1344            delimiter = file_format_delimiters.get(input_format, "\t")
 1345
 1346            # Load the input file
 1347            with open(self.input, "rt") as input_file:
 1348
 1349                # Use the appropriate file handler based on the input format
 1350                if input_compressed:
 1351                    input_file = bgzf.open(self.input, "rt")
 1352                if input_format in ["vcf"]:
 1353                    header_len = self.get_header_length()
 1354                else:
 1355                    header_len = 0
 1356
 1357                # Insert the file contents into a table
 1358                self.insert_file_to_table(
 1359                    input_file,
 1360                    columns=sql_create_table_columns_list_sql,
 1361                    header_len=header_len,
 1362                    sep=delimiter,
 1363                    chunksize=chunksize,
 1364                )
 1365
 1366        else:
 1367            log.error(
 1368                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1369            )
 1370            raise ValueError(
 1371                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1372            )
 1373
 1374        # Explode INFOS fields into table fields
 1375        if self.get_explode_infos():
 1376            self.explode_infos(
 1377                prefix=self.get_explode_infos_prefix(),
 1378                fields=self.get_explode_infos_fields(),
 1379                force=True,
 1380            )
 1381
 1382        # Create index after insertion
 1383        self.create_indexes()
 1384
 1385    def get_explode_infos(self) -> bool:
 1386        """
 1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1388        to False if it is not set.
 1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1390        value. If the parameter is not present, it will return False.
 1391        """
 1392
 1393        return self.get_param().get("explode", {}).get("explode_infos", False)
 1394
 1395    def get_explode_infos_fields(
 1396        self,
 1397        explode_infos_fields: str = None,
 1398        remove_fields_not_in_header: bool = False,
 1399    ) -> list:
 1400        """
 1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1402        the input parameter `explode_infos_fields`.
 1403
 1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1406        comma-separated list of field names to explode
 1407        :type explode_infos_fields: str
 1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1409        flag that determines whether to remove fields that are not present in the header. If it is set
 1410        to `True`, any field that is not in the header will be excluded from the list of exploded
 1411        information fields. If it is set to `, defaults to False
 1412        :type remove_fields_not_in_header: bool (optional)
 1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1417        splitting the string by commas.
 1418        """
 1419
 1420        # If no fields, get it in param
 1421        if not explode_infos_fields:
 1422            explode_infos_fields = (
 1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1424            )
 1425
 1426        # If no fields, defined as all fields in header using keyword
 1427        if not explode_infos_fields:
 1428            explode_infos_fields = "*"
 1429
 1430        # If fields list not empty
 1431        if explode_infos_fields:
 1432
 1433            # Input fields list
 1434            if isinstance(explode_infos_fields, str):
 1435                fields_input = explode_infos_fields.split(",")
 1436            elif isinstance(explode_infos_fields, list):
 1437                fields_input = explode_infos_fields
 1438            else:
 1439                fields_input = []
 1440
 1441            # Fields list without * keyword
 1442            fields_without_all = fields_input.copy()
 1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1444                fields_without_all.remove("*")
 1445
 1446            # Fields in header
 1447            fields_in_header = sorted(list(set(self.get_header().infos)))
 1448
 1449            # Construct list of fields
 1450            fields_output = []
 1451            for field in fields_input:
 1452
 1453                # Strip field
 1454                field = field.strip()
 1455
 1456                # format keyword * in regex
 1457                if field.upper() in ["*"]:
 1458                    field = ".*"
 1459
 1460                # Find all fields with pattern
 1461                r = re.compile(field)
 1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1463
 1464                # Remove fields input from search
 1465                if field in fields_search:
 1466                    fields_search = [field]
 1467                elif fields_search != [field]:
 1468                    fields_search = sorted(
 1469                        list(set(fields_search).difference(fields_input))
 1470                    )
 1471
 1472                # If field is not in header (avoid not well formatted header)
 1473                if not fields_search and not remove_fields_not_in_header:
 1474                    fields_search = [field]
 1475
 1476                # Add found fields
 1477                for new_field in fields_search:
 1478                    # Add field, if not already exists, and if it is in header (if asked)
 1479                    if (
 1480                        new_field not in fields_output
 1481                        and (
 1482                            not remove_fields_not_in_header
 1483                            or new_field in fields_in_header
 1484                        )
 1485                        and new_field not in [".*"]
 1486                    ):
 1487                        fields_output.append(new_field)
 1488
 1489            return fields_output
 1490
 1491        else:
 1492
 1493            return []
 1494
 1495    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1496        """
 1497        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1498        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1499        not provided.
 1500
 1501        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1502        prefix to be used for exploding or expanding information
 1503        :type explode_infos_prefix: str
 1504        :return: the value of the variable `explode_infos_prefix`.
 1505        """
 1506
 1507        if not explode_infos_prefix:
 1508            explode_infos_prefix = (
 1509                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1510            )
 1511
 1512        return explode_infos_prefix
 1513
 1514    def add_column(
 1515        self,
 1516        table_name,
 1517        column_name,
 1518        column_type,
 1519        default_value=None,
 1520        drop: bool = False,
 1521    ) -> dict:
 1522        """
 1523        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1524        doesn't already exist.
 1525
 1526        :param table_name: The name of the table to which you want to add a column
 1527        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1528        to the table
 1529        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1530        want to add to the table. It should be a string that represents the desired data type, such as
 1531        "INTEGER", "TEXT", "REAL", etc
 1532        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1533        default value for the newly added column. If a default value is provided, it will be assigned to
 1534        the column for any existing rows that do not have a value for that column
 1535        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1536        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1537        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1538        to False
 1539        :type drop: bool (optional)
 1540        :return: a boolean value indicating whether the column was successfully added to the table.
 1541        """
 1542
 1543        # added
 1544        added = False
 1545        dropped = False
 1546
 1547        # Check if the column already exists in the table
 1548        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1549        columns = self.get_query_to_df(query).columns.tolist()
 1550        if column_name.upper() in [c.upper() for c in columns]:
 1551            log.debug(
 1552                f"The {column_name} column already exists in the {table_name} table"
 1553            )
 1554            if drop:
 1555                self.drop_column(table_name=table_name, column_name=column_name)
 1556                dropped = True
 1557            else:
 1558                return None
 1559        else:
 1560            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1561
 1562        # Add column in table
 1563        add_column_query = (
 1564            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1565        )
 1566        if default_value is not None:
 1567            add_column_query += f" DEFAULT {default_value}"
 1568        self.execute_query(add_column_query)
 1569        added = not dropped
 1570        log.debug(
 1571            f"The {column_name} column was successfully added to the {table_name} table"
 1572        )
 1573
 1574        if added:
 1575            added_column = {
 1576                "table_name": table_name,
 1577                "column_name": column_name,
 1578                "column_type": column_type,
 1579                "default_value": default_value,
 1580            }
 1581        else:
 1582            added_column = None
 1583
 1584        return added_column
 1585
 1586    def drop_column(
 1587        self, column: dict = None, table_name: str = None, column_name: str = None
 1588    ) -> bool:
 1589        """
 1590        The `drop_column` function drops a specified column from a given table in a database and returns
 1591        True if the column was successfully dropped, and False if the column does not exist in the
 1592        table.
 1593
 1594        :param column: The `column` parameter is a dictionary that contains information about the column
 1595        you want to drop. It has two keys:
 1596        :type column: dict
 1597        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1598        drop a column
 1599        :type table_name: str
 1600        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1601        from the table
 1602        :type column_name: str
 1603        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1604        and False if the column does not exist in the table.
 1605        """
 1606
 1607        # Find column infos
 1608        if column:
 1609            if isinstance(column, dict):
 1610                table_name = column.get("table_name", None)
 1611                column_name = column.get("column_name", None)
 1612            elif isinstance(column, str):
 1613                table_name = self.get_table_variants()
 1614                column_name = column
 1615            else:
 1616                table_name = None
 1617                column_name = None
 1618
 1619        if not table_name and not column_name:
 1620            return False
 1621
 1622        # Removed
 1623        removed = False
 1624
 1625        # Check if the column already exists in the table
 1626        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1627        columns = self.get_query_to_df(query).columns.tolist()
 1628        if column_name in columns:
 1629            log.debug(f"The {column_name} column exists in the {table_name} table")
 1630        else:
 1631            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1632            return False
 1633
 1634        # Add column in table # ALTER TABLE integers DROP k
 1635        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1636        self.execute_query(add_column_query)
 1637        removed = True
 1638        log.debug(
 1639            f"The {column_name} column was successfully dropped to the {table_name} table"
 1640        )
 1641
 1642        return removed
 1643
    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix, falling back to "INFO/"
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated, defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
        be processed individually. The default value is, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If not
        provided, the variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # drop indexes (updates below would otherwise be slowed or blocked by them)
        self.drop_indexes()

        # connexion format (drives which SQL dialect is used for the UPDATE below)
        connexion_format = self.get_connexion_format()

        # Access
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        # Read-only access: no columns can be added, return empty list
        if access not in ["RO"]:

            # prefix: fall back to the configured prefix, then to "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos: columns present in the table but not in the header
            # (best-effort; failures simply mean no extra columns are considered)
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # Header infos
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Info fields to check: union of header-declared fields and requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (e.g. wildcards expanded to real field names)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Column name in the table: prefix + INFO field name
                info_id_sql = prefix + info

                # Only explode fields known from the header, the requested list,
                # or already present as extra table columns
                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # Fields absent from the header default to a string scalar
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Map VCF type to SQL type; multi-valued fields stay VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    # Populate the column when it was just created, or when forced
                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Update field array: extract "<info>=<value>" from the raw
                        # INFO column, treating '' and '.' as NULL.
                        # NOTE(review): update_info_field is only assigned for the
                        # duckdb/sqlite formats; any other format would append a
                        # stale or unbound value — confirm formats are limited to
                        # these two.
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no regexp by default: emulate the extraction
                            # with instr()/substr() arithmetic on the INFO string
                            update_info_field = f"""
                                "{info_id_sql}" =
                                    CASE
                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                    END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # By chromosomes: chunk the UPDATEs per chromosome to keep each
                # statement smaller (fall back to a single unfiltered pass)
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only useful when more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table: either one UPDATE setting all columns, or
                    # one UPDATE per exploded field
                    if proccess_all_fields_together:
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter_table_array_join}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                                UPDATE {table_variants}
                                SET {sql_info_alter}
                                {where_clause}
                                """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # create indexes (recreate them after the bulk updates, if requested)
        if create_index:
            self.create_indexes()

        return added_columns
 1860
 1861    def create_indexes(self) -> None:
 1862        """
 1863        Create indexes on the table after insertion
 1864        """
 1865
 1866        # Access
 1867        access = self.get_config().get("access", None)
 1868
 1869        # get table variants
 1870        table_variants = self.get_table_variants("FROM")
 1871
 1872        if self.get_indexing() and access not in ["RO"]:
 1873            # Create index
 1874            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1875            self.conn.execute(sql_create_table_index)
 1876            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1877            self.conn.execute(sql_create_table_index)
 1878            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1879            self.conn.execute(sql_create_table_index)
 1880            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1881            self.conn.execute(sql_create_table_index)
 1882            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1883            self.conn.execute(sql_create_table_index)
 1884            for field in self.index_additionnal_fields:
 1885                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1886                self.conn.execute(sql_create_table_index)
 1887
 1888    def drop_indexes(self) -> None:
 1889        """
 1890        Create indexes on the table after insertion
 1891        """
 1892
 1893        # Access
 1894        access = self.get_config().get("access", None)
 1895
 1896        # get table variants
 1897        table_variants = self.get_table_variants("FROM")
 1898
 1899        # Get database format
 1900        connexion_format = self.get_connexion_format()
 1901
 1902        if access not in ["RO"]:
 1903            if connexion_format in ["duckdb"]:
 1904                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 1905            elif connexion_format in ["sqlite"]:
 1906                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 1907
 1908            list_indexes = self.conn.execute(sql_list_indexes)
 1909            index_names = [row[0] for row in list_indexes.fetchall()]
 1910            for index in index_names:
 1911                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 1912                self.conn.execute(sql_drop_table_index)
 1913
 1914    def read_vcf_header(self, f) -> list:
 1915        """
 1916        It reads the header of a VCF file and returns a list of the header lines
 1917
 1918        :param f: the file object
 1919        :return: The header lines of the VCF file.
 1920        """
 1921
 1922        header_list = []
 1923        for line in f:
 1924            header_list.append(line)
 1925            if line.startswith("#CHROM"):
 1926                break
 1927        return header_list
 1928
 1929    def read_vcf_header_file(self, file: str = None) -> list:
 1930        """
 1931        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 1932        uncompressed files.
 1933
 1934        :param file: The `file` parameter is a string that represents the path to the VCF header file
 1935        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 1936        default to `None`
 1937        :type file: str
 1938        :return: The function `read_vcf_header_file` returns a list.
 1939        """
 1940
 1941        if self.get_input_compressed(input_file=file):
 1942            with bgzf.open(file, "rt") as f:
 1943                return self.read_vcf_header(f=f)
 1944        else:
 1945            with open(file, "rt") as f:
 1946                return self.read_vcf_header(f=f)
 1947
 1948    def execute_query(self, query: str):
 1949        """
 1950        It takes a query as an argument, executes it, and returns the results
 1951
 1952        :param query: The query to be executed
 1953        :return: The result of the query is being returned.
 1954        """
 1955        if query:
 1956            return self.conn.execute(query)  # .fetchall()
 1957        else:
 1958            return None
 1959
    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to a specified output file in various
        formats, including VCF, CSV, TSV, PSV, and Parquet.

        :param output_file: The `output_file` parameter is a string that specifies the name of the
        output file to be generated by the function. This is where the exported data will be saved
        :type output_file: str
        :param output_header: The `output_header` parameter is a string that specifies the name of the
        file where the header of the VCF file will be exported. If this parameter is not provided, the
        header will be exported to a file with the same name as the `output_file` parameter, but with
        the extension ".hdr"
        :type output_header: str
        :param export_header: The `export_header` parameter is a boolean flag that determines whether
        the header of a VCF file should be exported to a separate file or not. If `export_header` is
        True, the header will be exported to a file. If `export_header` is False, the header will not
        be, defaults to True, if output format is not VCF
        :type export_header: bool (optional)
        :param query: The `query` parameter is an optional SQL query that can be used to filter and
        select specific data from the VCF file before exporting it. If provided, only the data that
        matches the query will be exported
        :type query: str
        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
        organize data in a hierarchical directory structure based on the values of one or more columns.
        This can improve query performance when working with large datasets
        :type parquet_partitions: list
        :param chunk_size: The `chunk_size` parameter specifies the number of
        records in batch when exporting data in Parquet format. This parameter is used for
        partitioning the Parquet file into multiple files.
        :type chunk_size: int
        :param threads: The `threads` parameter is an optional parameter that specifies the number of
        threads to be used during the export process. It determines the level of parallelism and can
        improve the performance of the export operation. If not provided, the function will use the
        default number of threads
        :type threads: int
        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
        False
        :type sort: bool (optional)
        :param index: The `index` parameter is a boolean flag that determines whether an index should be
        created on the output file. If `index` is True, an index will be created. If `index` is False,
        no index will be created. The default value is False, defaults to False
        :type index: bool (optional)
        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
        sorting the output file. This parameter is only applicable when exporting data in VCF format
        :type order_by: str
        :return: a boolean value. It checks if the output file exists and returns True if it does, or
        None if it doesn't.
        """

        # Log
        log.info("Exporting...")

        # Full path
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove (cleaned up at the end of the export)
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            # NOTE(review): only output_file is passed here; presumably
            # export_header derives the header file name itself — confirm
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output
        # (the header file then becomes temporary and is removed below)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (accept a comma-separated string as well as a list)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # Export in Parquet: sqlite data is first dumped to a temporary
            # Parquet file, which then serves as the export source
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing colomns header
        # existing_columns_header = database.get_header_file_columns(output_header)
        existing_columns_header = database.get_header_columns_from_database()

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
        )

        # Remove
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of `and` are the same expression, so this
        # is effectively `os.path.exists(output_file) or None` — True when the
        # output file exists, None otherwise
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )
 2147
 2148    def get_extra_infos(self, table: str = None) -> list:
 2149        """
 2150        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2151        in the header.
 2152
 2153        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2154        name of the table from which you want to retrieve the extra columns that are not present in the
 2155        header. If the `table` parameter is not provided when calling the function, it will default to
 2156        using the variants
 2157        :type table: str
 2158        :return: A list of columns that are in the specified table but not in the header of the table.
 2159        """
 2160
 2161        header_columns = []
 2162
 2163        if not table:
 2164            table = self.get_table_variants(clause="from")
 2165            header_columns = self.get_header_columns()
 2166
 2167        # Check all columns in the database
 2168        query = f""" SELECT * FROM {table} LIMIT 1 """
 2169        log.debug(f"query {query}")
 2170        table_columns = self.get_query_to_df(query).columns.tolist()
 2171        extra_columns = []
 2172
 2173        # Construct extra infos (not in header)
 2174        for column in table_columns:
 2175            if column not in header_columns:
 2176                extra_columns.append(column)
 2177
 2178        return extra_columns
 2179
 2180    def get_extra_infos_sql(self, table: str = None) -> str:
 2181        """
 2182        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2183        by double quotes
 2184
 2185        :param table: The name of the table to get the extra infos from. If None, the default table is
 2186        used
 2187        :type table: str
 2188        :return: A string of the extra infos
 2189        """
 2190
 2191        return ", ".join(
 2192            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2193        )
 2194
 2195    def export_header(
 2196        self,
 2197        header_name: str = None,
 2198        output_file: str = None,
 2199        output_file_ext: str = ".hdr",
 2200        clean_header: bool = True,
 2201        remove_chrom_line: bool = False,
 2202    ) -> str:
 2203        """
 2204        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2205        specified options, and writes it to a new file.
 2206
 2207        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2208        this parameter is not specified, the header will be written to the output file
 2209        :type header_name: str
 2210        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2211        specify the name of the output file where the header will be written. If this parameter is not
 2212        provided, the header will be written to a temporary file
 2213        :type output_file: str
 2214        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2215        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2216        if not specified by the user. This extension will be appended to the `output_file` name to
 2217        create the final, defaults to .hdr
 2218        :type output_file_ext: str (optional)
 2219        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2220        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2221        `True`, the function will clean the header by modifying certain lines based on a specific
 2222        pattern. If `clean_header`, defaults to True
 2223        :type clean_header: bool (optional)
 2224        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2225        boolean flag that determines whether the #CHROM line should be removed from the header before
 2226        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2227        defaults to False
 2228        :type remove_chrom_line: bool (optional)
 2229        :return: The function `export_header` returns the name of the temporary header file that is
 2230        created.
 2231        """
 2232
 2233        if not header_name and not output_file:
 2234            output_file = self.get_output()
 2235
 2236        if self.get_header():
 2237
 2238            # Get header object
 2239            header_obj = self.get_header()
 2240
 2241            # Create database
 2242            db_for_header = Database(database=self.get_input())
 2243
 2244            # Get real columns in the file
 2245            db_header_columns = db_for_header.get_columns()
 2246
 2247            with tempfile.TemporaryDirectory() as tmpdir:
 2248
 2249                # Write header file
 2250                header_file_tmp = os.path.join(tmpdir, "header")
 2251                f = open(header_file_tmp, "w")
 2252                vcf.Writer(f, header_obj)
 2253                f.close()
 2254
 2255                # Replace #CHROM line with rel columns
 2256                header_list = db_for_header.read_header_file(
 2257                    header_file=header_file_tmp
 2258                )
 2259                header_list[-1] = "\t".join(db_header_columns)
 2260
 2261                # Remove CHROM line
 2262                if remove_chrom_line:
 2263                    header_list.pop()
 2264
 2265                # Clean header
 2266                if clean_header:
 2267                    header_list_clean = []
 2268                    for head in header_list:
 2269                        # Clean head for malformed header
 2270                        head_clean = head
 2271                        head_clean = re.subn(
 2272                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2273                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2274                            head_clean,
 2275                            2,
 2276                        )[0]
 2277                        # Write header
 2278                        header_list_clean.append(head_clean)
 2279                    header_list = header_list_clean
 2280
 2281            tmp_header_name = output_file + output_file_ext
 2282
 2283            f = open(tmp_header_name, "w")
 2284            for line in header_list:
 2285                f.write(line)
 2286            f.close()
 2287
 2288        return tmp_header_name
 2289
 2290    def export_variant_vcf(
 2291        self,
 2292        vcf_file,
 2293        remove_info: bool = False,
 2294        add_samples: bool = True,
 2295        list_samples: list = [],
 2296        where_clause: str = "",
 2297        index: bool = False,
 2298        threads: int | None = None,
 2299    ) -> bool | None:
 2300        """
 2301        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2302        remove INFO field, add samples, and control compression and indexing.
 2303
 2304        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2305        written to. It is the output file that will contain the filtered VCF data based on the specified
 2306        parameters
 2307        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2308        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2309        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2310        in, defaults to False
 2311        :type remove_info: bool (optional)
 2312        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2313        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2314        If set to False, the samples will be removed. The default value is True, defaults to True
 2315        :type add_samples: bool (optional)
 2316        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2317        in the output VCF file. By default, all samples will be included. If you provide a list of
 2318        samples, only those samples will be included in the output file
 2319        :type list_samples: list
 2320        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2321        determines whether or not to create an index for the output VCF file. If `index` is set to
 2322        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2323        :type index: bool (optional)
 2324        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2325        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2326        will be used during the export process. More threads can potentially speed up the export process
 2327        by utilizing multiple cores of the processor. If
 2328        :type threads: int | None
 2329        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2330        method with various parameters including the output file, query, threads, sort flag, and index
 2331        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2332        specified parameters and configurations provided in the `export_variant_vcf` function.
 2333        """
 2334
 2335        # Config
 2336        config = self.get_config()
 2337
 2338        # Extract VCF
 2339        log.debug("Export VCF...")
 2340
 2341        # Table variants
 2342        table_variants = self.get_table_variants()
 2343
 2344        # Threads
 2345        if not threads:
 2346            threads = self.get_threads()
 2347
 2348        # Info fields
 2349        if remove_info:
 2350            if not isinstance(remove_info, str):
 2351                remove_info = "."
 2352            info_field = f"""'{remove_info}' as INFO"""
 2353        else:
 2354            info_field = "INFO"
 2355
 2356        # Samples fields
 2357        if add_samples:
 2358            if not list_samples:
 2359                list_samples = self.get_header_sample_list()
 2360            if list_samples:
 2361                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2362            else:
 2363                samples_fields = ""
 2364            log.debug(f"samples_fields: {samples_fields}")
 2365        else:
 2366            samples_fields = ""
 2367
 2368        # Where clause
 2369        if where_clause is None:
 2370            where_clause = ""
 2371
 2372        # Variants
 2373        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2374        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2375        log.debug(f"sql_query_select={sql_query_select}")
 2376
 2377        return self.export_output(
 2378            output_file=vcf_file,
 2379            output_header=None,
 2380            export_header=True,
 2381            query=sql_query_select,
 2382            parquet_partitions=None,
 2383            chunk_size=config.get("chunk_size", None),
 2384            threads=threads,
 2385            sort=True,
 2386            index=index,
 2387            order_by=None,
 2388        )
 2389
 2390    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2391        """
 2392        It takes a list of commands and runs them in parallel using the number of threads specified
 2393
 2394        :param commands: A list of commands to run
 2395        :param threads: The number of threads to use, defaults to 1 (optional)
 2396        """
 2397
 2398        run_parallel_commands(commands, threads)
 2399
 2400    def get_threads(self, default: int = 1) -> int:
 2401        """
 2402        This function returns the number of threads to use for a job, with a default value of 1 if not
 2403        specified.
 2404
 2405        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2406        default number of threads to use if no specific value is provided. If no value is provided for
 2407        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2408        used, defaults to 1
 2409        :type default: int (optional)
 2410        :return: the number of threads to use for the current job.
 2411        """
 2412
 2413        # Config
 2414        config = self.get_config()
 2415
 2416        # Param
 2417        param = self.get_param()
 2418
 2419        # Input threads
 2420        input_thread = param.get("threads", config.get("threads", None))
 2421
 2422        # Check threads
 2423        if not input_thread:
 2424            threads = default
 2425        elif int(input_thread) <= 0:
 2426            threads = os.cpu_count()
 2427        else:
 2428            threads = int(input_thread)
 2429        return threads
 2430
 2431    def get_memory(self, default: str = None) -> str:
 2432        """
 2433        This function retrieves the memory value from parameters or configuration with a default value
 2434        if not found.
 2435
 2436        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2437        default value is used as a fallback in case the `memory` parameter is not provided in the
 2438        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2439        the function
 2440        :type default: str
 2441        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2442        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2443        return the default value provided as an argument to the function.
 2444        """
 2445
 2446        # Config
 2447        config = self.get_config()
 2448
 2449        # Param
 2450        param = self.get_param()
 2451
 2452        # Input threads
 2453        input_memory = param.get("memory", config.get("memory", None))
 2454
 2455        # Check threads
 2456        if input_memory:
 2457            memory = input_memory
 2458        else:
 2459            memory = default
 2460
 2461        return memory
 2462
 2463    def update_from_vcf(self, vcf_file: str) -> None:
 2464        """
 2465        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2466
 2467        :param vcf_file: the path to the VCF file
 2468        """
 2469
 2470        connexion_format = self.get_connexion_format()
 2471
 2472        if connexion_format in ["duckdb"]:
 2473            self.update_from_vcf_duckdb(vcf_file)
 2474        elif connexion_format in ["sqlite"]:
 2475            self.update_from_vcf_sqlite(vcf_file)
 2476
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (DuckDB).

        The VCF body is loaded into a pandas DataFrame named ``vcf_df``; the
        UPDATE query below references that local variable by name, which
        DuckDB resolves through its DataFrame replacement scan — do not
        rename the variable. Rows matching on #CHROM/POS/REF/ALT get the
        file's INFO appended to their existing INFO, with ';' inserted only
        when both sides carry content ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body; skipping the meta-header makes the '#CHROM'
        # line the DataFrame column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Append the matching VCF INFO to the current INFO; both CASE blocks
        # guard against '' / '.' placeholders so no stray ';' is produced
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2532
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (SQLite).

        The VCF is loaded into a temporary table cloned from the variants
        schema; rows matching on #CHROM/POS/REF/ALT get the file's INFO
        appended to their existing INFO, with ';' inserted only when both
        sides carry content ('' and '.' treated as empty). The temporary
        table is dropped afterwards.

        NOTE(review): exactly 8 column names are assigned below, so this
        assumes the VCF carries no FORMAT/sample columns — confirm callers.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the variants schema; 'WHERE 0'
        # copies the structure without copying any rows
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF body; comment='#' also drops the '#CHROM' header line,
        # hence header=None and the explicit column names below
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: concatenation uses the || operator (SQLite CONCAT)
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
 2590
 2591    def drop_variants_table(self) -> None:
 2592        """
 2593        > This function drops the variants table
 2594        """
 2595
 2596        table_variants = self.get_table_variants()
 2597        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2598        self.conn.execute(sql_table_variants)
 2599
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a column to the variants table (default `variant_id`) and populate
        it with a hash derived from the assembly and the `#CHROM`, `POS`,
        `REF`, and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the
        variants table, defaults to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the column is created and populated even if a
        "variant_id" extra column already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param overrides config, then the project default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (removed again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column only when missing (or when forced)
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the quoted literal text
            # '"<prefix>SVTYPE"', not the value of that column — confirm
            # whether the exploded SVTYPE column value was intended here
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove columns added by explode_infos
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
 2658
 2659    def get_variant_id_column(
 2660        self, variant_id_column: str = "variant_id", force: bool = None
 2661    ) -> str:
 2662        """
 2663        This function returns the variant_id column name
 2664
 2665        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2666        defaults to variant_id
 2667        :type variant_id_column: str (optional)
 2668        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2669        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2670        if it is not already set, or if it is set
 2671        :type force: bool
 2672        :return: The variant_id column name.
 2673        """
 2674
 2675        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2676
 2677    ###
 2678    # Annotation
 2679    ###
 2680
 2681    def scan_databases(
 2682        self,
 2683        database_formats: list = ["parquet"],
 2684        database_releases: list = ["current"],
 2685    ) -> dict:
 2686        """
 2687        The function `scan_databases` scans for available databases based on specified formats and
 2688        releases.
 2689
 2690        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2691        of the databases to be scanned. In this case, the accepted format is "parquet"
 2692        :type database_formats: list ["parquet"]
 2693        :param database_releases: The `database_releases` parameter is a list that specifies the
 2694        releases of the databases to be scanned. In the provided function, the default value for
 2695        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2696        databases that are in the "current"
 2697        :type database_releases: list
 2698        :return: The function `scan_databases` returns a dictionary containing information about
 2699        databases that match the specified formats and releases.
 2700        """
 2701
 2702        # Config
 2703        config = self.get_config()
 2704
 2705        # Param
 2706        param = self.get_param()
 2707
 2708        # Param - Assembly
 2709        assembly = param.get("assembly", config.get("assembly", None))
 2710        if not assembly:
 2711            assembly = DEFAULT_ASSEMBLY
 2712            log.warning(f"Default assembly '{assembly}'")
 2713
 2714        # Scan for availabled databases
 2715        log.info(
 2716            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2717        )
 2718        databases_infos_dict = databases_infos(
 2719            database_folder_releases=database_releases,
 2720            database_formats=database_formats,
 2721            assembly=assembly,
 2722            config=config,
 2723        )
 2724        log.info(
 2725            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2726        )
 2727
 2728        return databases_infos_dict
 2729
 2730    def annotation(self) -> None:
 2731        """
 2732        It annotates the VCF file with the annotations specified in the config file.
 2733        """
 2734
 2735        # Config
 2736        config = self.get_config()
 2737
 2738        # Param
 2739        param = self.get_param()
 2740
 2741        # Param - Assembly
 2742        assembly = param.get("assembly", config.get("assembly", None))
 2743        if not assembly:
 2744            assembly = DEFAULT_ASSEMBLY
 2745            log.warning(f"Default assembly '{assembly}'")
 2746
 2747        # annotations databases folders
 2748        annotations_databases = set(
 2749            config.get("folders", {})
 2750            .get("databases", {})
 2751            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2752            + config.get("folders", {})
 2753            .get("databases", {})
 2754            .get("parquet", ["~/howard/databases/parquet/current"])
 2755            + config.get("folders", {})
 2756            .get("databases", {})
 2757            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2758        )
 2759
 2760        # Get param annotations
 2761        if param.get("annotations", None) and isinstance(
 2762            param.get("annotations", None), str
 2763        ):
 2764            log.debug(param.get("annotations", None))
 2765            param_annotation_list = param.get("annotations").split(",")
 2766        else:
 2767            param_annotation_list = []
 2768
 2769        # Each tools param
 2770        if param.get("annotation_parquet", None) != None:
 2771            log.debug(
 2772                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2773            )
 2774            if isinstance(param.get("annotation_parquet", None), list):
 2775                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2776            else:
 2777                param_annotation_list.append(param.get("annotation_parquet"))
 2778        if param.get("annotation_snpsift", None) != None:
 2779            if isinstance(param.get("annotation_snpsift", None), list):
 2780                param_annotation_list.append(
 2781                    "snpsift:"
 2782                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2783                )
 2784            else:
 2785                param_annotation_list.append(
 2786                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2787                )
 2788        if param.get("annotation_snpeff", None) != None:
 2789            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2790        if param.get("annotation_bcftools", None) != None:
 2791            if isinstance(param.get("annotation_bcftools", None), list):
 2792                param_annotation_list.append(
 2793                    "bcftools:"
 2794                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2795                )
 2796            else:
 2797                param_annotation_list.append(
 2798                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2799                )
 2800        if param.get("annotation_annovar", None) != None:
 2801            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2802        if param.get("annotation_exomiser", None) != None:
 2803            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2804        if param.get("annotation_splice", None) != None:
 2805            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2806
 2807        # Merge param annotations list
 2808        param["annotations"] = ",".join(param_annotation_list)
 2809
 2810        # debug
 2811        log.debug(f"param_annotations={param['annotations']}")
 2812
 2813        if param.get("annotations"):
 2814
 2815            # Log
 2816            # log.info("Annotations - Check annotation parameters")
 2817
 2818            if not "annotation" in param:
 2819                param["annotation"] = {}
 2820
 2821            # List of annotations parameters
 2822            annotations_list_input = {}
 2823            if isinstance(param.get("annotations", None), str):
 2824                annotation_file_list = [
 2825                    value for value in param.get("annotations", "").split(",")
 2826                ]
 2827                for annotation_file in annotation_file_list:
 2828                    annotations_list_input[annotation_file] = {"INFO": None}
 2829            else:
 2830                annotations_list_input = param.get("annotations", {})
 2831
 2832            log.info(f"Quick Annotations:")
 2833            for annotation_key in list(annotations_list_input.keys()):
 2834                log.info(f"   {annotation_key}")
 2835
 2836            # List of annotations and associated fields
 2837            annotations_list = {}
 2838
 2839            for annotation_file in annotations_list_input:
 2840
 2841                # Explode annotations if ALL
 2842                if (
 2843                    annotation_file.upper() == "ALL"
 2844                    or annotation_file.upper().startswith("ALL:")
 2845                ):
 2846
 2847                    # check ALL parameters (formats, releases)
 2848                    annotation_file_split = annotation_file.split(":")
 2849                    database_formats = "parquet"
 2850                    database_releases = "current"
 2851                    for annotation_file_option in annotation_file_split[1:]:
 2852                        database_all_options_split = annotation_file_option.split("=")
 2853                        if database_all_options_split[0] == "format":
 2854                            database_formats = database_all_options_split[1].split("+")
 2855                        if database_all_options_split[0] == "release":
 2856                            database_releases = database_all_options_split[1].split("+")
 2857
 2858                    # Scan for availabled databases
 2859                    databases_infos_dict = self.scan_databases(
 2860                        database_formats=database_formats,
 2861                        database_releases=database_releases,
 2862                    )
 2863
 2864                    # Add found databases in annotation parameters
 2865                    for database_infos in databases_infos_dict.keys():
 2866                        annotations_list[database_infos] = {"INFO": None}
 2867
 2868                else:
 2869                    annotations_list[annotation_file] = annotations_list_input[
 2870                        annotation_file
 2871                    ]
 2872
 2873            # Check each databases
 2874            if len(annotations_list):
 2875
 2876                log.info(
 2877                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 2878                )
 2879
 2880                for annotation_file in annotations_list:
 2881
 2882                    # Init
 2883                    annotations = annotations_list.get(annotation_file, None)
 2884
 2885                    # Annotation snpEff
 2886                    if annotation_file.startswith("snpeff"):
 2887
 2888                        log.debug(f"Quick Annotation snpEff")
 2889
 2890                        if "snpeff" not in param["annotation"]:
 2891                            param["annotation"]["snpeff"] = {}
 2892
 2893                        if "options" not in param["annotation"]["snpeff"]:
 2894                            param["annotation"]["snpeff"]["options"] = ""
 2895
 2896                        # snpEff options in annotations
 2897                        param["annotation"]["snpeff"]["options"] = "".join(
 2898                            annotation_file.split(":")[1:]
 2899                        )
 2900
 2901                    # Annotation Annovar
 2902                    elif annotation_file.startswith("annovar"):
 2903
 2904                        log.debug(f"Quick Annotation Annovar")
 2905
 2906                        if "annovar" not in param["annotation"]:
 2907                            param["annotation"]["annovar"] = {}
 2908
 2909                        if "annotations" not in param["annotation"]["annovar"]:
 2910                            param["annotation"]["annovar"]["annotations"] = {}
 2911
 2912                        # Options
 2913                        annotation_file_split = annotation_file.split(":")
 2914                        for annotation_file_annotation in annotation_file_split[1:]:
 2915                            if annotation_file_annotation:
 2916                                param["annotation"]["annovar"]["annotations"][
 2917                                    annotation_file_annotation
 2918                                ] = annotations
 2919
 2920                    # Annotation Exomiser
 2921                    elif annotation_file.startswith("exomiser"):
 2922
 2923                        log.debug(f"Quick Annotation Exomiser")
 2924
 2925                        param["annotation"]["exomiser"] = params_string_to_dict(
 2926                            annotation_file
 2927                        )
 2928
 2929                    # Annotation Splice
 2930                    elif annotation_file.startswith("splice"):
 2931
 2932                        log.debug(f"Quick Annotation Splice")
 2933
 2934                        param["annotation"]["splice"] = params_string_to_dict(
 2935                            annotation_file
 2936                        )
 2937
 2938                    # Annotation Parquet or BCFTOOLS
 2939                    else:
 2940
 2941                        # Tools detection
 2942                        if annotation_file.startswith("bcftools:"):
 2943                            annotation_tool_initial = "bcftools"
 2944                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2945                        elif annotation_file.startswith("snpsift:"):
 2946                            annotation_tool_initial = "snpsift"
 2947                            annotation_file = ":".join(annotation_file.split(":")[1:])
 2948                        else:
 2949                            annotation_tool_initial = None
 2950
 2951                        # list of files
 2952                        annotation_file_list = annotation_file.replace("+", ":").split(
 2953                            ":"
 2954                        )
 2955
 2956                        for annotation_file in annotation_file_list:
 2957
 2958                            if annotation_file:
 2959
 2960                                # Annotation tool initial
 2961                                annotation_tool = annotation_tool_initial
 2962
 2963                                # Find file
 2964                                annotation_file_found = None
 2965
 2966                                # Expand user
 2967                                annotation_file = full_path(annotation_file)
 2968
 2969                                if os.path.exists(annotation_file):
 2970                                    annotation_file_found = annotation_file
 2971
 2972                                else:
 2973                                    # Find within assembly folders
 2974                                    for annotations_database in annotations_databases:
 2975                                        found_files = find_all(
 2976                                            annotation_file,
 2977                                            os.path.join(
 2978                                                annotations_database, assembly
 2979                                            ),
 2980                                        )
 2981                                        if len(found_files) > 0:
 2982                                            annotation_file_found = found_files[0]
 2983                                            break
 2984                                    if not annotation_file_found and not assembly:
 2985                                        # Find within folders
 2986                                        for (
 2987                                            annotations_database
 2988                                        ) in annotations_databases:
 2989                                            found_files = find_all(
 2990                                                annotation_file, annotations_database
 2991                                            )
 2992                                            if len(found_files) > 0:
 2993                                                annotation_file_found = found_files[0]
 2994                                                break
 2995                                log.debug(
 2996                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 2997                                )
 2998
 2999                                # Full path
 3000                                annotation_file_found = full_path(annotation_file_found)
 3001
 3002                                if annotation_file_found:
 3003
 3004                                    database = Database(database=annotation_file_found)
 3005                                    quick_annotation_format = database.get_format()
 3006                                    quick_annotation_is_compressed = (
 3007                                        database.is_compressed()
 3008                                    )
 3009                                    quick_annotation_is_indexed = os.path.exists(
 3010                                        f"{annotation_file_found}.tbi"
 3011                                    )
 3012                                    bcftools_preference = False
 3013
 3014                                    # Check Annotation Tool
 3015                                    if not annotation_tool:
 3016                                        if (
 3017                                            bcftools_preference
 3018                                            and quick_annotation_format
 3019                                            in ["vcf", "bed"]
 3020                                            and quick_annotation_is_compressed
 3021                                            and quick_annotation_is_indexed
 3022                                        ):
 3023                                            annotation_tool = "bcftools"
 3024                                        elif quick_annotation_format in [
 3025                                            "vcf",
 3026                                            "bed",
 3027                                            "tsv",
 3028                                            "tsv",
 3029                                            "csv",
 3030                                            "json",
 3031                                            "tbl",
 3032                                            "parquet",
 3033                                            "duckdb",
 3034                                        ]:
 3035                                            annotation_tool = "parquet"
 3036                                        else:
 3037                                            log.error(
 3038                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3039                                            )
 3040                                            raise ValueError(
 3041                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3042                                            )
 3043
 3044                                    log.debug(
 3045                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3046                                    )
 3047
 3048                                    # Annotation Tool dispatch
 3049                                    if annotation_tool:
 3050                                        if annotation_tool not in param["annotation"]:
 3051                                            param["annotation"][annotation_tool] = {}
 3052                                        if (
 3053                                            "annotations"
 3054                                            not in param["annotation"][annotation_tool]
 3055                                        ):
 3056                                            param["annotation"][annotation_tool][
 3057                                                "annotations"
 3058                                            ] = {}
 3059                                        param["annotation"][annotation_tool][
 3060                                            "annotations"
 3061                                        ][annotation_file_found] = annotations
 3062
 3063                                else:
 3064                                    log.error(
 3065                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3066                                    )
 3067
 3068                self.set_param(param)
 3069
 3070        if param.get("annotation", None):
 3071            log.info("Annotations")
 3072            if param.get("annotation", {}).get("parquet", None):
 3073                log.info("Annotations 'parquet'...")
 3074                self.annotation_parquet()
 3075            if param.get("annotation", {}).get("bcftools", None):
 3076                log.info("Annotations 'bcftools'...")
 3077                self.annotation_bcftools()
 3078            if param.get("annotation", {}).get("snpsift", None):
 3079                log.info("Annotations 'snpsift'...")
 3080                self.annotation_snpsift()
 3081            if param.get("annotation", {}).get("annovar", None):
 3082                log.info("Annotations 'annovar'...")
 3083                self.annotation_annovar()
 3084            if param.get("annotation", {}).get("snpeff", None):
 3085                log.info("Annotations 'snpeff'...")
 3086                self.annotation_snpeff()
 3087            if param.get("annotation", {}).get("exomiser", None) is not None:
 3088                log.info("Annotations 'exomiser'...")
 3089                self.annotation_exomiser()
 3090            if param.get("annotation", {}).get("splice", None) is not None:
 3091                log.info("Annotations 'splice' ...")
 3092                self.annotation_splice()
 3093
 3094        # Explode INFOS fields into table fields
 3095        if self.get_explode_infos():
 3096            self.explode_infos(
 3097                prefix=self.get_explode_infos_prefix(),
 3098                fields=self.get_explode_infos_fields(),
 3099                force=True,
 3100            )
 3101
 3102    def annotation_snpsift(self, threads: int = None) -> None:
 3103        """
 3104        This function annotate with bcftools
 3105
 3106        :param threads: Number of threads to use
 3107        :return: the value of the variable "return_value".
 3108        """
 3109
 3110        # DEBUG
 3111        log.debug("Start annotation with bcftools databases")
 3112
 3113        # Threads
 3114        if not threads:
 3115            threads = self.get_threads()
 3116        log.debug("Threads: " + str(threads))
 3117
 3118        # Config
 3119        config = self.get_config()
 3120        log.debug("Config: " + str(config))
 3121
 3122        # Config - snpSift
 3123        snpsift_bin_command = get_bin_command(
 3124            bin="SnpSift.jar",
 3125            tool="snpsift",
 3126            bin_type="jar",
 3127            config=config,
 3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3129        )
 3130        if not snpsift_bin_command:
 3131            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3132            log.error(msg_err)
 3133            raise ValueError(msg_err)
 3134
 3135        # Config - bcftools
 3136        bcftools_bin_command = get_bin_command(
 3137            bin="bcftools",
 3138            tool="bcftools",
 3139            bin_type="bin",
 3140            config=config,
 3141            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3142        )
 3143        if not bcftools_bin_command:
 3144            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3145            log.error(msg_err)
 3146            raise ValueError(msg_err)
 3147
 3148        # Config - BCFTools databases folders
 3149        databases_folders = set(
 3150            self.get_config()
 3151            .get("folders", {})
 3152            .get("databases", {})
 3153            .get("annotations", ["."])
 3154            + self.get_config()
 3155            .get("folders", {})
 3156            .get("databases", {})
 3157            .get("bcftools", ["."])
 3158        )
 3159        log.debug("Databases annotations: " + str(databases_folders))
 3160
 3161        # Param
 3162        annotations = (
 3163            self.get_param()
 3164            .get("annotation", {})
 3165            .get("snpsift", {})
 3166            .get("annotations", None)
 3167        )
 3168        log.debug("Annotations: " + str(annotations))
 3169
 3170        # Assembly
 3171        assembly = self.get_param().get(
 3172            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3173        )
 3174
 3175        # Data
 3176        table_variants = self.get_table_variants()
 3177
 3178        # Check if not empty
 3179        log.debug("Check if not empty")
 3180        sql_query_chromosomes = (
 3181            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3182        )
 3183        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3184        if not sql_query_chromosomes_df["count"][0]:
 3185            log.info(f"VCF empty")
 3186            return
 3187
 3188        # VCF header
 3189        vcf_reader = self.get_header()
 3190        log.debug("Initial header: " + str(vcf_reader.infos))
 3191
 3192        # Existing annotations
 3193        for vcf_annotation in self.get_header().infos:
 3194
 3195            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3196            log.debug(
 3197                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3198            )
 3199
 3200        if annotations:
 3201
 3202            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3203
 3204                # Export VCF file
 3205                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3206
 3207                # Init
 3208                commands = {}
 3209
 3210                for annotation in annotations:
 3211                    annotation_fields = annotations[annotation]
 3212
 3213                    # Annotation Name
 3214                    annotation_name = os.path.basename(annotation)
 3215
 3216                    if not annotation_fields:
 3217                        annotation_fields = {"INFO": None}
 3218
 3219                    log.debug(f"Annotation '{annotation_name}'")
 3220                    log.debug(
 3221                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3222                    )
 3223
 3224                    # Create Database
 3225                    database = Database(
 3226                        database=annotation,
 3227                        databases_folders=databases_folders,
 3228                        assembly=assembly,
 3229                    )
 3230
 3231                    # Find files
 3232                    db_file = database.get_database()
 3233                    db_file = full_path(db_file)
 3234                    db_hdr_file = database.get_header_file()
 3235                    db_hdr_file = full_path(db_hdr_file)
 3236                    db_file_type = database.get_format()
 3237                    db_tbi_file = f"{db_file}.tbi"
 3238                    db_file_compressed = database.is_compressed()
 3239
 3240                    # Check if compressed
 3241                    if not db_file_compressed:
 3242                        log.error(
 3243                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3244                        )
 3245                        raise ValueError(
 3246                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3247                        )
 3248
 3249                    # Check if indexed
 3250                    if not os.path.exists(db_tbi_file):
 3251                        log.error(
 3252                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3253                        )
 3254                        raise ValueError(
 3255                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3256                        )
 3257
 3258                    # Check index - try to create if not exists
 3259                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3260                        log.error("Annotation failed: database not valid")
 3261                        log.error(f"Annotation annotation file: {db_file}")
 3262                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3263                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3264                        raise ValueError(
 3265                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3266                        )
 3267                    else:
 3268
 3269                        log.debug(
 3270                            f"Annotation '{annotation}' - file: "
 3271                            + str(db_file)
 3272                            + " and "
 3273                            + str(db_hdr_file)
 3274                        )
 3275
 3276                        # Load header as VCF object
 3277                        db_hdr_vcf = Variants(input=db_hdr_file)
 3278                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3279                        log.debug(
 3280                            "Annotation database header: "
 3281                            + str(db_hdr_vcf_header_infos)
 3282                        )
 3283
 3284                        # For all fields in database
 3285                        annotation_fields_full = False
 3286                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3287                            annotation_fields = {
 3288                                key: key for key in db_hdr_vcf_header_infos
 3289                            }
 3290                            log.debug(
 3291                                "Annotation database header - All annotations added: "
 3292                                + str(annotation_fields)
 3293                            )
 3294                            annotation_fields_full = True
 3295
 3296                        # # Create file for field rename
 3297                        # log.debug("Create file for field rename")
 3298                        # tmp_rename = NamedTemporaryFile(
 3299                        #     prefix=self.get_prefix(),
 3300                        #     dir=self.get_tmp_dir(),
 3301                        #     suffix=".rename",
 3302                        #     delete=False,
 3303                        # )
 3304                        # tmp_rename_name = tmp_rename.name
 3305                        # tmp_files.append(tmp_rename_name)
 3306
 3307                        # Number of fields
 3308                        nb_annotation_field = 0
 3309                        annotation_list = []
 3310                        annotation_infos_rename_list = []
 3311
 3312                        for annotation_field in annotation_fields:
 3313
 3314                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3315                            annotation_fields_new_name = annotation_fields.get(
 3316                                annotation_field, annotation_field
 3317                            )
 3318                            if not annotation_fields_new_name:
 3319                                annotation_fields_new_name = annotation_field
 3320
 3321                            # Check if field is in DB and if field is not elready in input data
 3322                            if (
 3323                                annotation_field in db_hdr_vcf.get_header().infos
 3324                                and annotation_fields_new_name
 3325                                not in self.get_header().infos
 3326                            ):
 3327
 3328                                log.info(
 3329                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3330                                )
 3331
 3332                                # BCFTools annotate param to rename fields
 3333                                if annotation_field != annotation_fields_new_name:
 3334                                    annotation_infos_rename_list.append(
 3335                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3336                                    )
 3337
 3338                                # Add INFO field to header
 3339                                db_hdr_vcf_header_infos_number = (
 3340                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3341                                )
 3342                                db_hdr_vcf_header_infos_type = (
 3343                                    db_hdr_vcf_header_infos[annotation_field].type
 3344                                    or "String"
 3345                                )
 3346                                db_hdr_vcf_header_infos_description = (
 3347                                    db_hdr_vcf_header_infos[annotation_field].desc
 3348                                    or f"{annotation_field} description"
 3349                                )
 3350                                db_hdr_vcf_header_infos_source = (
 3351                                    db_hdr_vcf_header_infos[annotation_field].source
 3352                                    or "unknown"
 3353                                )
 3354                                db_hdr_vcf_header_infos_version = (
 3355                                    db_hdr_vcf_header_infos[annotation_field].version
 3356                                    or "unknown"
 3357                                )
 3358
 3359                                vcf_reader.infos[annotation_fields_new_name] = (
 3360                                    vcf.parser._Info(
 3361                                        annotation_fields_new_name,
 3362                                        db_hdr_vcf_header_infos_number,
 3363                                        db_hdr_vcf_header_infos_type,
 3364                                        db_hdr_vcf_header_infos_description,
 3365                                        db_hdr_vcf_header_infos_source,
 3366                                        db_hdr_vcf_header_infos_version,
 3367                                        self.code_type_map[
 3368                                            db_hdr_vcf_header_infos_type
 3369                                        ],
 3370                                    )
 3371                                )
 3372
 3373                                annotation_list.append(annotation_field)
 3374
 3375                                nb_annotation_field += 1
 3376
 3377                            else:
 3378
 3379                                if (
 3380                                    annotation_field
 3381                                    not in db_hdr_vcf.get_header().infos
 3382                                ):
 3383                                    log.warning(
 3384                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3385                                    )
 3386                                if (
 3387                                    annotation_fields_new_name
 3388                                    in self.get_header().infos
 3389                                ):
 3390                                    log.warning(
 3391                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3392                                    )
 3393
 3394                        log.info(
 3395                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3396                        )
 3397
 3398                        annotation_infos = ",".join(annotation_list)
 3399
 3400                        if annotation_infos != "":
 3401
 3402                            # Annotated VCF (and error file)
 3403                            tmp_annotation_vcf_name = os.path.join(
 3404                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3405                            )
 3406                            tmp_annotation_vcf_name_err = (
 3407                                tmp_annotation_vcf_name + ".err"
 3408                            )
 3409
 3410                            # Add fields to annotate
 3411                            if not annotation_fields_full:
 3412                                annotation_infos_option = f"-info {annotation_infos}"
 3413                            else:
 3414                                annotation_infos_option = ""
 3415
 3416                            # Info fields rename
 3417                            if annotation_infos_rename_list:
 3418                                annotation_infos_rename = " -c " + ",".join(
 3419                                    annotation_infos_rename_list
 3420                                )
 3421                            else:
 3422                                annotation_infos_rename = ""
 3423
 3424                            # Annotate command
 3425                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3426
 3427                            # Add command
 3428                            commands[command_annotate] = tmp_annotation_vcf_name
 3429
 3430                if commands:
 3431
 3432                    # Export VCF file
 3433                    self.export_variant_vcf(
 3434                        vcf_file=tmp_vcf_name,
 3435                        remove_info=True,
 3436                        add_samples=False,
 3437                        index=True,
 3438                    )
 3439                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3440
 3441                    # Num command
 3442                    nb_command = 0
 3443
 3444                    # Annotate
 3445                    for command_annotate in commands:
 3446                        nb_command += 1
 3447                        log.info(
 3448                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3449                        )
 3450                        log.debug(f"command_annotate={command_annotate}")
 3451                        run_parallel_commands([command_annotate], threads)
 3452
 3453                        # Debug
 3454                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3455
 3456                        # Update variants
 3457                        log.info(
 3458                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3459                        )
 3460                        self.update_from_vcf(commands[command_annotate])
 3461
 3462    def annotation_bcftools(self, threads: int = None) -> None:
 3463        """
 3464        This function annotate with bcftools
 3465
 3466        :param threads: Number of threads to use
 3467        :return: the value of the variable "return_value".
 3468        """
 3469
 3470        # DEBUG
 3471        log.debug("Start annotation with bcftools databases")
 3472
 3473        # Threads
 3474        if not threads:
 3475            threads = self.get_threads()
 3476        log.debug("Threads: " + str(threads))
 3477
 3478        # Config
 3479        config = self.get_config()
 3480        log.debug("Config: " + str(config))
 3481
 3482        # DEBUG
 3483        delete_tmp = True
 3484        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3485            delete_tmp = False
 3486            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3487
 3488        # Config - BCFTools bin command
 3489        bcftools_bin_command = get_bin_command(
 3490            bin="bcftools",
 3491            tool="bcftools",
 3492            bin_type="bin",
 3493            config=config,
 3494            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3495        )
 3496        if not bcftools_bin_command:
 3497            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3498            log.error(msg_err)
 3499            raise ValueError(msg_err)
 3500
 3501        # Config - BCFTools databases folders
 3502        databases_folders = set(
 3503            self.get_config()
 3504            .get("folders", {})
 3505            .get("databases", {})
 3506            .get("annotations", ["."])
 3507            + self.get_config()
 3508            .get("folders", {})
 3509            .get("databases", {})
 3510            .get("bcftools", ["."])
 3511        )
 3512        log.debug("Databases annotations: " + str(databases_folders))
 3513
 3514        # Param
 3515        annotations = (
 3516            self.get_param()
 3517            .get("annotation", {})
 3518            .get("bcftools", {})
 3519            .get("annotations", None)
 3520        )
 3521        log.debug("Annotations: " + str(annotations))
 3522
 3523        # Assembly
 3524        assembly = self.get_param().get(
 3525            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3526        )
 3527
 3528        # Data
 3529        table_variants = self.get_table_variants()
 3530
 3531        # Check if not empty
 3532        log.debug("Check if not empty")
 3533        sql_query_chromosomes = (
 3534            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3535        )
 3536        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3537        if not sql_query_chromosomes_df["count"][0]:
 3538            log.info(f"VCF empty")
 3539            return
 3540
 3541        # Export in VCF
 3542        log.debug("Create initial file to annotate")
 3543        tmp_vcf = NamedTemporaryFile(
 3544            prefix=self.get_prefix(),
 3545            dir=self.get_tmp_dir(),
 3546            suffix=".vcf.gz",
 3547            delete=False,
 3548        )
 3549        tmp_vcf_name = tmp_vcf.name
 3550
 3551        # VCF header
 3552        vcf_reader = self.get_header()
 3553        log.debug("Initial header: " + str(vcf_reader.infos))
 3554
 3555        # Existing annotations
 3556        for vcf_annotation in self.get_header().infos:
 3557
 3558            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3559            log.debug(
 3560                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3561            )
 3562
 3563        if annotations:
 3564
 3565            tmp_ann_vcf_list = []
 3566            commands = []
 3567            tmp_files = []
 3568            err_files = []
 3569
 3570            for annotation in annotations:
 3571                annotation_fields = annotations[annotation]
 3572
 3573                # Annotation Name
 3574                annotation_name = os.path.basename(annotation)
 3575
 3576                if not annotation_fields:
 3577                    annotation_fields = {"INFO": None}
 3578
 3579                log.debug(f"Annotation '{annotation_name}'")
 3580                log.debug(
 3581                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3582                )
 3583
 3584                # Create Database
 3585                database = Database(
 3586                    database=annotation,
 3587                    databases_folders=databases_folders,
 3588                    assembly=assembly,
 3589                )
 3590
 3591                # Find files
 3592                db_file = database.get_database()
 3593                db_file = full_path(db_file)
 3594                db_hdr_file = database.get_header_file()
 3595                db_hdr_file = full_path(db_hdr_file)
 3596                db_file_type = database.get_format()
 3597                db_tbi_file = f"{db_file}.tbi"
 3598                db_file_compressed = database.is_compressed()
 3599
 3600                # Check if compressed
 3601                if not db_file_compressed:
 3602                    log.error(
 3603                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3604                    )
 3605                    raise ValueError(
 3606                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3607                    )
 3608
 3609                # Check if indexed
 3610                if not os.path.exists(db_tbi_file):
 3611                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3612                    raise ValueError(
 3613                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3614                    )
 3615
 3616                # Check index - try to create if not exists
 3617                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3618                    log.error("Annotation failed: database not valid")
 3619                    log.error(f"Annotation annotation file: {db_file}")
 3620                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3621                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3622                    raise ValueError(
 3623                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3624                    )
 3625                else:
 3626
 3627                    log.debug(
 3628                        f"Annotation '{annotation}' - file: "
 3629                        + str(db_file)
 3630                        + " and "
 3631                        + str(db_hdr_file)
 3632                    )
 3633
 3634                    # Load header as VCF object
 3635                    db_hdr_vcf = Variants(input=db_hdr_file)
 3636                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3637                    log.debug(
 3638                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3639                    )
 3640
 3641                    # For all fields in database
 3642                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3643                        annotation_fields = {
 3644                            key: key for key in db_hdr_vcf_header_infos
 3645                        }
 3646                        log.debug(
 3647                            "Annotation database header - All annotations added: "
 3648                            + str(annotation_fields)
 3649                        )
 3650
 3651                    # Number of fields
 3652                    nb_annotation_field = 0
 3653                    annotation_list = []
 3654
 3655                    for annotation_field in annotation_fields:
 3656
  3657                        # TODO: field renaming is parameterized here but not yet managed (currently skipped)
 3658                        annotation_fields_new_name = annotation_fields.get(
 3659                            annotation_field, annotation_field
 3660                        )
 3661                        if not annotation_fields_new_name:
 3662                            annotation_fields_new_name = annotation_field
 3663
  3664                        # Check if field is in DB and if field is not already in input data
 3665                        if (
 3666                            annotation_field in db_hdr_vcf.get_header().infos
 3667                            and annotation_fields_new_name
 3668                            not in self.get_header().infos
 3669                        ):
 3670
 3671                            log.info(
 3672                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3673                            )
 3674
 3675                            # Add INFO field to header
 3676                            db_hdr_vcf_header_infos_number = (
 3677                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3678                            )
 3679                            db_hdr_vcf_header_infos_type = (
 3680                                db_hdr_vcf_header_infos[annotation_field].type
 3681                                or "String"
 3682                            )
 3683                            db_hdr_vcf_header_infos_description = (
 3684                                db_hdr_vcf_header_infos[annotation_field].desc
 3685                                or f"{annotation_field} description"
 3686                            )
 3687                            db_hdr_vcf_header_infos_source = (
 3688                                db_hdr_vcf_header_infos[annotation_field].source
 3689                                or "unknown"
 3690                            )
 3691                            db_hdr_vcf_header_infos_version = (
 3692                                db_hdr_vcf_header_infos[annotation_field].version
 3693                                or "unknown"
 3694                            )
 3695
 3696                            vcf_reader.infos[annotation_fields_new_name] = (
 3697                                vcf.parser._Info(
 3698                                    annotation_fields_new_name,
 3699                                    db_hdr_vcf_header_infos_number,
 3700                                    db_hdr_vcf_header_infos_type,
 3701                                    db_hdr_vcf_header_infos_description,
 3702                                    db_hdr_vcf_header_infos_source,
 3703                                    db_hdr_vcf_header_infos_version,
 3704                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3705                                )
 3706                            )
 3707
 3708                            # annotation_list.append(annotation_field)
 3709                            if annotation_field != annotation_fields_new_name:
 3710                                annotation_list.append(
 3711                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3712                                )
 3713                            else:
 3714                                annotation_list.append(annotation_field)
 3715
 3716                            nb_annotation_field += 1
 3717
 3718                        else:
 3719
 3720                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3721                                log.warning(
 3722                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3723                                )
 3724                            if annotation_fields_new_name in self.get_header().infos:
 3725                                log.warning(
 3726                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3727                                )
 3728
 3729                    log.info(
 3730                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3731                    )
 3732
 3733                    annotation_infos = ",".join(annotation_list)
 3734
 3735                    if annotation_infos != "":
 3736
 3737                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3738                        log.debug("Protect Header file - remove #CHROM line if exists")
 3739                        tmp_header_vcf = NamedTemporaryFile(
 3740                            prefix=self.get_prefix(),
 3741                            dir=self.get_tmp_dir(),
 3742                            suffix=".hdr",
 3743                            delete=False,
 3744                        )
 3745                        tmp_header_vcf_name = tmp_header_vcf.name
 3746                        tmp_files.append(tmp_header_vcf_name)
 3747                        # Command
 3748                        if db_hdr_file.endswith(".gz"):
 3749                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3750                        else:
 3751                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3752                        # Run
 3753                        run_parallel_commands([command_extract_header], 1)
 3754
  3755                        # Find chromosomes
 3756                        log.debug("Find chromosomes ")
 3757                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3758                        sql_query_chromosomes_df = self.get_query_to_df(
 3759                            sql_query_chromosomes
 3760                        )
 3761                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3762
 3763                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3764
 3765                        # BED columns in the annotation file
 3766                        if db_file_type in ["bed"]:
 3767                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3768
 3769                        for chrom in chomosomes_list:
 3770
 3771                            # Create BED on initial VCF
 3772                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3773                            tmp_bed = NamedTemporaryFile(
 3774                                prefix=self.get_prefix(),
 3775                                dir=self.get_tmp_dir(),
 3776                                suffix=".bed",
 3777                                delete=False,
 3778                            )
 3779                            tmp_bed_name = tmp_bed.name
 3780                            tmp_files.append(tmp_bed_name)
 3781
  3782                            # Detect regions
 3783                            log.debug(
 3784                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3785                            )
 3786                            window = 1000000
 3787                            sql_query_intervals_for_bed = f"""
 3788                                SELECT  \"#CHROM\",
 3789                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3790                                        \"POS\"+{window}
 3791                                FROM {table_variants} as table_variants
 3792                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3793                            """
 3794                            regions = self.conn.execute(
 3795                                sql_query_intervals_for_bed
 3796                            ).fetchall()
 3797                            merged_regions = merge_regions(regions)
 3798                            log.debug(
 3799                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3800                            )
 3801
 3802                            header = ["#CHROM", "START", "END"]
 3803                            with open(tmp_bed_name, "w") as f:
 3804                                # Write the header with tab delimiter
 3805                                f.write("\t".join(header) + "\n")
 3806                                for d in merged_regions:
 3807                                    # Write each data row with tab delimiter
 3808                                    f.write("\t".join(map(str, d)) + "\n")
 3809
 3810                            # Tmp files
 3811                            tmp_annotation_vcf = NamedTemporaryFile(
 3812                                prefix=self.get_prefix(),
 3813                                dir=self.get_tmp_dir(),
 3814                                suffix=".vcf.gz",
 3815                                delete=False,
 3816                            )
 3817                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3818                            tmp_files.append(tmp_annotation_vcf_name)
 3819                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3820                            tmp_annotation_vcf_name_err = (
 3821                                tmp_annotation_vcf_name + ".err"
 3822                            )
 3823                            err_files.append(tmp_annotation_vcf_name_err)
 3824
 3825                            # Annotate Command
 3826                            log.debug(
 3827                                f"Annotation '{annotation}' - add bcftools command"
 3828                            )
 3829
 3830                            # Command
 3831                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3832
 3833                            # Add command
 3834                            commands.append(command_annotate)
 3835
 3836            # if some commands
 3837            if commands:
 3838
 3839                # Export VCF file
 3840                self.export_variant_vcf(
 3841                    vcf_file=tmp_vcf_name,
 3842                    remove_info=True,
 3843                    add_samples=False,
 3844                    index=True,
 3845                )
 3846
 3847                # Threads
 3848                # calculate threads for annotated commands
 3849                if commands:
 3850                    threads_bcftools_annotate = round(threads / len(commands))
 3851                else:
 3852                    threads_bcftools_annotate = 1
 3853
 3854                if not threads_bcftools_annotate:
 3855                    threads_bcftools_annotate = 1
 3856
 3857                # Add threads option to bcftools commands
 3858                if threads_bcftools_annotate > 1:
 3859                    commands_threaded = []
 3860                    for command in commands:
 3861                        commands_threaded.append(
 3862                            command.replace(
 3863                                f"{bcftools_bin_command} annotate ",
 3864                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3865                            )
 3866                        )
 3867                    commands = commands_threaded
 3868
 3869                # Command annotation multithreading
 3870                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3871                log.info(
 3872                    f"Annotation - Annotation multithreaded in "
 3873                    + str(len(commands))
 3874                    + " commands"
 3875                )
 3876
 3877                run_parallel_commands(commands, threads)
 3878
 3879                # Merge
 3880                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 3881
 3882                if tmp_ann_vcf_list_cmd:
 3883
 3884                    # Tmp file
 3885                    tmp_annotate_vcf = NamedTemporaryFile(
 3886                        prefix=self.get_prefix(),
 3887                        dir=self.get_tmp_dir(),
 3888                        suffix=".vcf.gz",
 3889                        delete=True,
 3890                    )
 3891                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 3892                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 3893                    err_files.append(tmp_annotate_vcf_name_err)
 3894
 3895                    # Tmp file remove command
 3896                    tmp_files_remove_command = ""
 3897                    if tmp_files:
 3898                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 3899
 3900                    # Command merge
 3901                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 3902                    log.info(
 3903                        f"Annotation - Annotation merging "
 3904                        + str(len(commands))
 3905                        + " annotated files"
 3906                    )
 3907                    log.debug(f"Annotation - merge command: {merge_command}")
 3908                    run_parallel_commands([merge_command], 1)
 3909
 3910                    # Error messages
 3911                    log.info(f"Error/Warning messages:")
 3912                    error_message_command_all = []
 3913                    error_message_command_warning = []
 3914                    error_message_command_err = []
 3915                    for err_file in err_files:
 3916                        with open(err_file, "r") as f:
 3917                            for line in f:
 3918                                message = line.strip()
 3919                                error_message_command_all.append(message)
 3920                                if line.startswith("[W::"):
 3921                                    error_message_command_warning.append(message)
 3922                                if line.startswith("[E::"):
 3923                                    error_message_command_err.append(
 3924                                        f"{err_file}: " + message
 3925                                    )
 3926                    # log info
 3927                    for message in list(
 3928                        set(error_message_command_err + error_message_command_warning)
 3929                    ):
 3930                        log.info(f"   {message}")
 3931                    # debug info
 3932                    for message in list(set(error_message_command_all)):
 3933                        log.debug(f"   {message}")
 3934                    # failed
 3935                    if len(error_message_command_err):
 3936                        log.error("Annotation failed: Error in commands")
 3937                        raise ValueError("Annotation failed: Error in commands")
 3938
 3939                    # Update variants
 3940                    log.info(f"Annotation - Updating...")
 3941                    self.update_from_vcf(tmp_annotate_vcf_name)
 3942
 3943    def annotation_exomiser(self, threads: int = None) -> None:
 3944        """
 3945        This function annotate with Exomiser
 3946
 3947        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 3948        - "analysis" (dict/file):
  3949            Full analysis dictionary parameters (see Exomiser docs).
 3950            Either a dict, or a file in JSON or YAML format.
  3951            These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO)
 3952            Default : None
 3953        - "preset" (string):
 3954            Analysis preset (available in config folder).
 3955            Used if no full "analysis" is provided.
 3956            Default: "exome"
 3957        - "phenopacket" (dict/file):
  3958            Samples and phenotypic features parameters (see Exomiser docs).
 3959            Either a dict, or a file in JSON or YAML format.
 3960            Default: None
 3961        - "subject" (dict):
 3962            Sample parameters (see Exomiser docs).
 3963            Example:
 3964                "subject":
 3965                    {
 3966                        "id": "ISDBM322017",
 3967                        "sex": "FEMALE"
 3968                    }
 3969            Default: None
 3970        - "sample" (string):
 3971            Sample name to construct "subject" section:
 3972                "subject":
 3973                    {
 3974                        "id": "<sample>",
 3975                        "sex": "UNKNOWN_SEX"
 3976                    }
 3977            Default: None
 3978        - "phenotypicFeatures" (dict)
 3979            Phenotypic features to construct "subject" section.
 3980            Example:
 3981                "phenotypicFeatures":
 3982                    [
 3983                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 3984                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 3985                    ]
 3986        - "hpo" (list)
 3987            List of HPO ids as phenotypic features.
 3988            Example:
 3989                "hpo": ['0001156', '0001363', '0011304', '0010055']
 3990            Default: []
 3991        - "outputOptions" (dict):
 3992            Output options (see Exomiser docs).
 3993            Default:
 3994                "output_options" =
 3995                    {
 3996                        "outputContributingVariantsOnly": False,
 3997                        "numGenes": 0,
 3998                        "outputFormats": ["TSV_VARIANT", "VCF"]
 3999                    }
 4000        - "transcript_source" (string):
 4001            Transcript source (either "refseq", "ucsc", "ensembl")
 4002            Default: "refseq"
 4003        - "exomiser_to_info" (boolean):
 4004            Add exomiser TSV file columns as INFO fields in VCF.
 4005            Default: False
 4006        - "release" (string):
  4007            Exomiser database release.
 4008            If not exists, database release will be downloaded (take a while).
 4009            Default: None (provided by application.properties configuration file)
 4010        - "exomiser_application_properties" (file):
 4011            Exomiser configuration file (see Exomiser docs).
 4012            Useful to automatically download databases (especially for specific genome databases).
 4013
 4014        Notes:
 4015        - If no sample in parameters, first sample in VCF will be chosen
  4016        - If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
 4017
 4018        :param threads: The number of threads to use
 4019        :return: None.
 4020        """
 4021
 4022        # DEBUG
 4023        log.debug("Start annotation with Exomiser databases")
 4024
 4025        # Threads
 4026        if not threads:
 4027            threads = self.get_threads()
 4028        log.debug("Threads: " + str(threads))
 4029
 4030        # Config
 4031        config = self.get_config()
 4032        log.debug("Config: " + str(config))
 4033
 4034        # Config - Folders - Databases
 4035        databases_folders = (
 4036            config.get("folders", {})
 4037            .get("databases", {})
 4038            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4039        )
 4040        databases_folders = full_path(databases_folders)
 4041        if not os.path.exists(databases_folders):
 4042            log.error(f"Databases annotations: {databases_folders} NOT found")
 4043        log.debug("Databases annotations: " + str(databases_folders))
 4044
 4045        # Config - Exomiser
 4046        exomiser_bin_command = get_bin_command(
 4047            bin="exomiser-cli*.jar",
 4048            tool="exomiser",
 4049            bin_type="jar",
 4050            config=config,
 4051            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4052        )
 4053        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4054        if not exomiser_bin_command:
 4055            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4056            log.error(msg_err)
 4057            raise ValueError(msg_err)
 4058
 4059        # Param
 4060        param = self.get_param()
 4061        log.debug("Param: " + str(param))
 4062
 4063        # Param - Exomiser
 4064        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4065        log.debug(f"Param Exomiser: {param_exomiser}")
 4066
 4067        # Param - Assembly
 4068        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4069        log.debug("Assembly: " + str(assembly))
 4070
 4071        # Data
 4072        table_variants = self.get_table_variants()
 4073
 4074        # Check if not empty
 4075        log.debug("Check if not empty")
 4076        sql_query_chromosomes = (
 4077            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4078        )
 4079        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4080            log.info(f"VCF empty")
 4081            return False
 4082
 4083        # VCF header
 4084        vcf_reader = self.get_header()
 4085        log.debug("Initial header: " + str(vcf_reader.infos))
 4086
 4087        # Samples
 4088        samples = self.get_header_sample_list()
 4089        if not samples:
 4090            log.error("No Samples in VCF")
 4091            return False
 4092        log.debug(f"Samples: {samples}")
 4093
 4094        # Memory limit
 4095        memory_limit = self.get_memory("8G")
 4096        log.debug(f"memory_limit: {memory_limit}")
 4097
 4098        # Exomiser java options
 4099        exomiser_java_options = (
 4100            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4101        )
 4102        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4103
 4104        # Download Exomiser (if not exists)
 4105        exomiser_release = param_exomiser.get("release", None)
 4106        exomiser_application_properties = param_exomiser.get(
 4107            "exomiser_application_properties", None
 4108        )
 4109        databases_download_exomiser(
 4110            assemblies=[assembly],
 4111            exomiser_folder=databases_folders,
 4112            exomiser_release=exomiser_release,
 4113            exomiser_phenotype_release=exomiser_release,
 4114            exomiser_application_properties=exomiser_application_properties,
 4115        )
 4116
 4117        # Force annotation
 4118        force_update_annotation = True
 4119
 4120        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4121            log.debug("Start annotation Exomiser")
 4122
 4123            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4124
 4125                # tmp_dir = "/tmp/exomiser"
 4126
 4127                ### ANALYSIS ###
 4128                ################
 4129
 4130                # Create analysis.json through analysis dict
 4131                # either analysis in param or by default
  4132                # (depending on preset exome/genome)
 4133
 4134                # Init analysis dict
 4135                param_exomiser_analysis_dict = {}
 4136
 4137                # analysis from param
 4138                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4139                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4140
  4141                # If analysis in param -> load analysis json
 4142                if param_exomiser_analysis:
 4143
 4144                    # If param analysis is a file and exists
 4145                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4146                        param_exomiser_analysis
 4147                    ):
 4148                        # Load analysis file into analysis dict (either yaml or json)
 4149                        with open(param_exomiser_analysis) as json_file:
 4150                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4151
 4152                    # If param analysis is a dict
 4153                    elif isinstance(param_exomiser_analysis, dict):
 4154                        # Load analysis dict into analysis dict (either yaml or json)
 4155                        param_exomiser_analysis_dict = param_exomiser_analysis
 4156
 4157                    # Error analysis type
 4158                    else:
 4159                        log.error(f"Analysis type unknown. Check param file.")
 4160                        raise ValueError(f"Analysis type unknown. Check param file.")
 4161
 4162                # Case no input analysis config file/dict
 4163                # Use preset (exome/genome) to open default config file
 4164                if not param_exomiser_analysis_dict:
 4165
 4166                    # default preset
 4167                    default_preset = "exome"
 4168
 4169                    # Get param preset or default preset
 4170                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4171
 4172                    # Try to find if preset is a file
 4173                    if os.path.exists(param_exomiser_preset):
 4174                        # Preset file is provided in full path
 4175                        param_exomiser_analysis_default_config_file = (
 4176                            param_exomiser_preset
 4177                        )
 4178                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4179                    #     # Preset file is provided in full path
 4180                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4181                    elif os.path.exists(
 4182                        os.path.join(folder_config, param_exomiser_preset)
 4183                    ):
 4184                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4185                        param_exomiser_analysis_default_config_file = os.path.join(
 4186                            folder_config, param_exomiser_preset
 4187                        )
 4188                    else:
 4189                        # Construct preset file
 4190                        param_exomiser_analysis_default_config_file = os.path.join(
 4191                            folder_config,
 4192                            f"preset-{param_exomiser_preset}-analysis.json",
 4193                        )
 4194
 4195                    # If preset file exists
 4196                    param_exomiser_analysis_default_config_file = full_path(
 4197                        param_exomiser_analysis_default_config_file
 4198                    )
 4199                    if os.path.exists(param_exomiser_analysis_default_config_file):
  4200                        # Load preset file into analysis dict (either yaml or json)
 4201                        with open(
 4202                            param_exomiser_analysis_default_config_file
 4203                        ) as json_file:
 4204                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4205                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4206                                json_file
 4207                            )
 4208
 4209                    # Error preset file
 4210                    else:
 4211                        log.error(
 4212                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4213                        )
 4214                        raise ValueError(
 4215                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4216                        )
 4217
 4218                # If no analysis dict created
 4219                if not param_exomiser_analysis_dict:
 4220                    log.error(f"No analysis config")
 4221                    raise ValueError(f"No analysis config")
 4222
 4223                # Log
 4224                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4225
 4226                ### PHENOPACKET ###
 4227                ###################
 4228
 4229                # If no PhenoPacket in analysis dict -> check in param
 4230                if "phenopacket" not in param_exomiser_analysis_dict:
 4231
 4232                    # If PhenoPacket in param -> load anlaysis json
 4233                    if param_exomiser.get("phenopacket", None):
 4234
 4235                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4236                        param_exomiser_phenopacket = full_path(
 4237                            param_exomiser_phenopacket
 4238                        )
 4239
 4240                        # If param phenopacket is a file and exists
 4241                        if isinstance(
 4242                            param_exomiser_phenopacket, str
 4243                        ) and os.path.exists(param_exomiser_phenopacket):
 4244                            # Load phenopacket file into analysis dict (either yaml or json)
 4245                            with open(param_exomiser_phenopacket) as json_file:
 4246                                param_exomiser_analysis_dict["phenopacket"] = (
 4247                                    yaml.safe_load(json_file)
 4248                                )
 4249
 4250                        # If param phenopacket is a dict
 4251                        elif isinstance(param_exomiser_phenopacket, dict):
 4252                            # Load phenopacket dict into analysis dict (either yaml or json)
 4253                            param_exomiser_analysis_dict["phenopacket"] = (
 4254                                param_exomiser_phenopacket
 4255                            )
 4256
 4257                        # Error phenopacket type
 4258                        else:
 4259                            log.error(f"Phenopacket type unknown. Check param file.")
 4260                            raise ValueError(
 4261                                f"Phenopacket type unknown. Check param file."
 4262                            )
 4263
 4264                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4265                if "phenopacket" not in param_exomiser_analysis_dict:
 4266
 4267                    # Init PhenoPacket
 4268                    param_exomiser_analysis_dict["phenopacket"] = {
 4269                        "id": "analysis",
 4270                        "proband": {},
 4271                    }
 4272
 4273                    ### Add subject ###
 4274
 4275                    # If subject exists
 4276                    param_exomiser_subject = param_exomiser.get("subject", {})
 4277
 4278                    # If subject not exists -> found sample ID
 4279                    if not param_exomiser_subject:
 4280
 4281                        # Found sample ID in param
 4282                        sample = param_exomiser.get("sample", None)
 4283
 4284                        # Find sample ID (first sample)
 4285                        if not sample:
 4286                            sample_list = self.get_header_sample_list()
 4287                            if len(sample_list) > 0:
 4288                                sample = sample_list[0]
 4289                            else:
 4290                                log.error(f"No sample found")
 4291                                raise ValueError(f"No sample found")
 4292
 4293                        # Create subject
 4294                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4295
 4296                    # Add to dict
 4297                    param_exomiser_analysis_dict["phenopacket"][
 4298                        "subject"
 4299                    ] = param_exomiser_subject
 4300
 4301                    ### Add "phenotypicFeatures" ###
 4302
 4303                    # If phenotypicFeatures exists
 4304                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4305                        "phenotypicFeatures", []
 4306                    )
 4307
 4308                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4309                    if not param_exomiser_phenotypicfeatures:
 4310
 4311                        # Found HPO in param
 4312                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4313
 4314                        # Split HPO if list in string format separated by comma
 4315                        if isinstance(param_exomiser_hpo, str):
 4316                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4317
 4318                        # Create HPO list
 4319                        for hpo in param_exomiser_hpo:
 4320                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4321                            param_exomiser_phenotypicfeatures.append(
 4322                                {
 4323                                    "type": {
 4324                                        "id": f"HP:{hpo_clean}",
 4325                                        "label": f"HP:{hpo_clean}",
 4326                                    }
 4327                                }
 4328                            )
 4329
 4330                    # Add to dict
 4331                    param_exomiser_analysis_dict["phenopacket"][
 4332                        "phenotypicFeatures"
 4333                    ] = param_exomiser_phenotypicfeatures
 4334
 4335                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4336                    if not param_exomiser_phenotypicfeatures:
 4337                        for step in param_exomiser_analysis_dict.get(
 4338                            "analysis", {}
 4339                        ).get("steps", []):
 4340                            if "hiPhivePrioritiser" in step:
 4341                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4342                                    "steps", []
 4343                                ).remove(step)
 4344
 4345                ### Add Input File ###
 4346
 4347                # Initial file name and htsFiles
 4348                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4349                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4350                    {
 4351                        "uri": tmp_vcf_name,
 4352                        "htsFormat": "VCF",
 4353                        "genomeAssembly": assembly,
 4354                    }
 4355                ]
 4356
 4357                ### Add metaData ###
 4358
 4359                # If metaData not in analysis dict
 4360                if "metaData" not in param_exomiser_analysis_dict:
 4361                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4362                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4363                        "createdBy": "howard",
 4364                        "phenopacketSchemaVersion": 1,
 4365                    }
 4366
 4367                ### OutputOptions ###
 4368
 4369                # Init output result folder
 4370                output_results = os.path.join(tmp_dir, "results")
 4371
 4372                # If no outputOptions in analysis dict
 4373                if "outputOptions" not in param_exomiser_analysis_dict:
 4374
 4375                    # default output formats
 4376                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4377
 4378                    # Get outputOptions in param
 4379                    output_options = param_exomiser.get("outputOptions", None)
 4380
 4381                    # If no output_options in param -> check
 4382                    if not output_options:
 4383                        output_options = {
 4384                            "outputContributingVariantsOnly": False,
 4385                            "numGenes": 0,
 4386                            "outputFormats": defaut_output_formats,
 4387                        }
 4388
 4389                    # Replace outputDirectory in output options
 4390                    output_options["outputDirectory"] = output_results
 4391                    output_options["outputFileName"] = "howard"
 4392
 4393                    # Add outputOptions in analysis dict
 4394                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4395
 4396                else:
 4397
 4398                    # Replace output_results and output format (if exists in param)
 4399                    param_exomiser_analysis_dict["outputOptions"][
 4400                        "outputDirectory"
 4401                    ] = output_results
 4402                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4403                        list(
 4404                            set(
 4405                                param_exomiser_analysis_dict.get(
 4406                                    "outputOptions", {}
 4407                                ).get("outputFormats", [])
 4408                                + ["TSV_VARIANT", "VCF"]
 4409                            )
 4410                        )
 4411                    )
 4412
 4413                # log
 4414                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4415
 4416                ### ANALYSIS FILE ###
 4417                #####################
 4418
 4419                ### Full JSON analysis config file ###
 4420
 4421                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4422                with open(exomiser_analysis, "w") as fp:
 4423                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4424
 4425                ### SPLIT analysis and sample config files
 4426
 4427                # Splitted analysis dict
 4428                param_exomiser_analysis_dict_for_split = (
 4429                    param_exomiser_analysis_dict.copy()
 4430                )
 4431
 4432                # Phenopacket JSON file
 4433                exomiser_analysis_phenopacket = os.path.join(
 4434                    tmp_dir, "analysis_phenopacket.json"
 4435                )
 4436                with open(exomiser_analysis_phenopacket, "w") as fp:
 4437                    json.dump(
 4438                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4439                        fp,
 4440                        indent=4,
 4441                    )
 4442
 4443                # Analysis JSON file without Phenopacket parameters
 4444                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4445                exomiser_analysis_analysis = os.path.join(
 4446                    tmp_dir, "analysis_analysis.json"
 4447                )
 4448                with open(exomiser_analysis_analysis, "w") as fp:
 4449                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4450
 4451                ### INITAL VCF file ###
 4452                #######################
 4453
 4454                ### Create list of samples to use and include inti initial VCF file ####
 4455
 4456                # Subject (main sample)
 4457                # Get sample ID in analysis dict
 4458                sample_subject = (
 4459                    param_exomiser_analysis_dict.get("phenopacket", {})
 4460                    .get("subject", {})
 4461                    .get("id", None)
 4462                )
 4463                sample_proband = (
 4464                    param_exomiser_analysis_dict.get("phenopacket", {})
 4465                    .get("proband", {})
 4466                    .get("subject", {})
 4467                    .get("id", None)
 4468                )
 4469                sample = []
 4470                if sample_subject:
 4471                    sample.append(sample_subject)
 4472                if sample_proband:
 4473                    sample.append(sample_proband)
 4474
 4475                # Get sample ID within Pedigree
 4476                pedigree_persons_list = (
 4477                    param_exomiser_analysis_dict.get("phenopacket", {})
 4478                    .get("pedigree", {})
 4479                    .get("persons", {})
 4480                )
 4481
 4482                # Create list with all sample ID in pedigree (if exists)
 4483                pedigree_persons = []
 4484                for person in pedigree_persons_list:
 4485                    pedigree_persons.append(person.get("individualId"))
 4486
 4487                # Concat subject sample ID and samples ID in pedigreesamples
 4488                samples = list(set(sample + pedigree_persons))
 4489
 4490                # Check if sample list is not empty
 4491                if not samples:
 4492                    log.error(f"No samples found")
 4493                    raise ValueError(f"No samples found")
 4494
 4495                # Create VCF with sample (either sample in param or first one by default)
 4496                # Export VCF file
 4497                self.export_variant_vcf(
 4498                    vcf_file=tmp_vcf_name,
 4499                    remove_info=True,
 4500                    add_samples=True,
 4501                    list_samples=samples,
 4502                    index=False,
 4503                )
 4504
 4505                ### Execute Exomiser ###
 4506                ########################
 4507
 4508                # Init command
 4509                exomiser_command = ""
 4510
 4511                # Command exomiser options
 4512                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4513
 4514                # Release
 4515                exomiser_release = param_exomiser.get("release", None)
 4516                if exomiser_release:
 4517                    # phenotype data version
 4518                    exomiser_options += (
 4519                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4520                    )
 4521                    # data version
 4522                    exomiser_options += (
 4523                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4524                    )
 4525                    # variant white list
 4526                    variant_white_list_file = (
 4527                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4528                    )
 4529                    if os.path.exists(
 4530                        os.path.join(
 4531                            databases_folders, assembly, variant_white_list_file
 4532                        )
 4533                    ):
 4534                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4535
 4536                # transcript_source
 4537                transcript_source = param_exomiser.get(
 4538                    "transcript_source", None
 4539                )  # ucsc, refseq, ensembl
 4540                if transcript_source:
 4541                    exomiser_options += (
 4542                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4543                    )
 4544
 4545                # If analysis contain proband param
 4546                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4547                    "proband", {}
 4548                ):
 4549                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4550
 4551                # If no proband (usually uniq sample)
 4552                else:
 4553                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4554
 4555                # Log
 4556                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4557
 4558                # Run command
 4559                result = subprocess.call(
 4560                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4561                )
 4562                if result:
 4563                    log.error("Exomiser command failed")
 4564                    raise ValueError("Exomiser command failed")
 4565
 4566                ### RESULTS ###
 4567                ###############
 4568
 4569                ### Annotate with TSV fields ###
 4570
 4571                # Init result tsv file
 4572                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4573
 4574                # Init result tsv file
 4575                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4576
 4577                # Parse TSV file and explode columns in INFO field
 4578                if exomiser_to_info and os.path.exists(output_results_tsv):
 4579
 4580                    # Log
 4581                    log.debug("Exomiser columns to VCF INFO field")
 4582
 4583                    # Retrieve columns and types
 4584                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4585                    output_results_tsv_df = self.get_query_to_df(query)
 4586                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4587
 4588                    # Init concat fields for update
 4589                    sql_query_update_concat_fields = []
 4590
 4591                    # Fields to avoid
 4592                    fields_to_avoid = [
 4593                        "CONTIG",
 4594                        "START",
 4595                        "END",
 4596                        "REF",
 4597                        "ALT",
 4598                        "QUAL",
 4599                        "FILTER",
 4600                        "GENOTYPE",
 4601                    ]
 4602
 4603                    # List all columns to add into header
 4604                    for header_column in output_results_tsv_columns:
 4605
 4606                        # If header column is enable
 4607                        if header_column not in fields_to_avoid:
 4608
 4609                            # Header info type
 4610                            header_info_type = "String"
 4611                            header_column_df = output_results_tsv_df[header_column]
 4612                            header_column_df_dtype = header_column_df.dtype
 4613                            if header_column_df_dtype == object:
 4614                                if (
 4615                                    pd.to_numeric(header_column_df, errors="coerce")
 4616                                    .notnull()
 4617                                    .all()
 4618                                ):
 4619                                    header_info_type = "Float"
 4620                            else:
 4621                                header_info_type = "Integer"
 4622
 4623                            # Header info
 4624                            characters_to_validate = ["-"]
 4625                            pattern = "[" + "".join(characters_to_validate) + "]"
 4626                            header_info_name = re.sub(
 4627                                pattern,
 4628                                "_",
 4629                                f"Exomiser_{header_column}".replace("#", ""),
 4630                            )
 4631                            header_info_number = "."
 4632                            header_info_description = (
 4633                                f"Exomiser {header_column} annotation"
 4634                            )
 4635                            header_info_source = "Exomiser"
 4636                            header_info_version = "unknown"
 4637                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4638                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4639                                header_info_name,
 4640                                header_info_number,
 4641                                header_info_type,
 4642                                header_info_description,
 4643                                header_info_source,
 4644                                header_info_version,
 4645                                header_info_code,
 4646                            )
 4647
 4648                            # Add field to add for update to concat fields
 4649                            sql_query_update_concat_fields.append(
 4650                                f"""
 4651                                CASE
 4652                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4653                                    THEN concat(
 4654                                        '{header_info_name}=',
 4655                                        table_parquet."{header_column}",
 4656                                        ';'
 4657                                        )
 4658
 4659                                    ELSE ''
 4660                                END
 4661                            """
 4662                            )
 4663
 4664                    # Update query
 4665                    sql_query_update = f"""
 4666                        UPDATE {table_variants} as table_variants
 4667                            SET INFO = concat(
 4668                                            CASE
 4669                                                WHEN INFO NOT IN ('', '.')
 4670                                                THEN INFO
 4671                                                ELSE ''
 4672                                            END,
 4673                                            CASE
 4674                                                WHEN table_variants.INFO NOT IN ('','.')
 4675                                                THEN ';'
 4676                                                ELSE ''
 4677                                            END,
 4678                                            (
 4679                                            SELECT 
 4680                                                concat(
 4681                                                    {",".join(sql_query_update_concat_fields)}
 4682                                                )
 4683                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4684                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4685                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4686                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4687                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4688                                            )
 4689                                        )
 4690                            ;
 4691                        """
 4692
 4693                    # Update
 4694                    self.conn.execute(sql_query_update)
 4695
 4696                ### Annotate with VCF INFO field ###
 4697
 4698                # Init result VCF file
 4699                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4700
 4701                # If VCF exists
 4702                if os.path.exists(output_results_vcf):
 4703
 4704                    # Log
 4705                    log.debug("Exomiser result VCF update variants")
 4706
 4707                    # Find Exomiser INFO field annotation in header
 4708                    with gzip.open(output_results_vcf, "rt") as f:
 4709                        header_list = self.read_vcf_header(f)
 4710                    exomiser_vcf_header = vcf.Reader(
 4711                        io.StringIO("\n".join(header_list))
 4712                    )
 4713
 4714                    # Add annotation INFO field to header
 4715                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4716
 4717                    # Update variants with VCF
 4718                    self.update_from_vcf(output_results_vcf)
 4719
 4720        return True
 4721
 4722    def annotation_snpeff(self, threads: int = None) -> None:
 4723        """
 4724        This function annotate with snpEff
 4725
 4726        :param threads: The number of threads to use
 4727        :return: the value of the variable "return_value".
 4728        """
 4729
 4730        # DEBUG
 4731        log.debug("Start annotation with snpeff databases")
 4732
 4733        # Threads
 4734        if not threads:
 4735            threads = self.get_threads()
 4736        log.debug("Threads: " + str(threads))
 4737
 4738        # DEBUG
 4739        delete_tmp = True
 4740        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4741            delete_tmp = False
 4742            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4743
 4744        # Config
 4745        config = self.get_config()
 4746        log.debug("Config: " + str(config))
 4747
 4748        # Config - Folders - Databases
 4749        databases_folders = (
 4750            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4751        )
 4752        log.debug("Databases annotations: " + str(databases_folders))
 4753
 4754        # # Config - Java
 4755        # java_bin = get_bin(
 4756        #     tool="java",
 4757        #     bin="java",
 4758        #     bin_type="bin",
 4759        #     config=config,
 4760        #     default_folder="/usr/bin",
 4761        # )
 4762        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4763        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4764        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4765
 4766        # # Config - snpEff bin
 4767        # snpeff_jar = get_bin(
 4768        #     tool="snpeff",
 4769        #     bin="snpEff.jar",
 4770        #     bin_type="jar",
 4771        #     config=config,
 4772        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4773        # )
 4774        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4775        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4776        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4777
 4778        # Config - snpEff bin command
 4779        snpeff_bin_command = get_bin_command(
 4780            bin="snpEff.jar",
 4781            tool="snpeff",
 4782            bin_type="jar",
 4783            config=config,
 4784            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4785        )
 4786        if not snpeff_bin_command:
 4787            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4788            log.error(msg_err)
 4789            raise ValueError(msg_err)
 4790
 4791        # Config - snpEff databases
 4792        snpeff_databases = (
 4793            config.get("folders", {})
 4794            .get("databases", {})
 4795            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4796        )
 4797        snpeff_databases = full_path(snpeff_databases)
 4798        if snpeff_databases is not None and snpeff_databases != "":
 4799            log.debug(f"Create snpEff databases folder")
 4800            if not os.path.exists(snpeff_databases):
 4801                os.makedirs(snpeff_databases)
 4802
 4803        # Param
 4804        param = self.get_param()
 4805        log.debug("Param: " + str(param))
 4806
 4807        # Param
 4808        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4809        log.debug("Options: " + str(options))
 4810
 4811        # Param - Assembly
 4812        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4813
 4814        # Param - Options
 4815        snpeff_options = (
 4816            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4817        )
 4818        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4819        snpeff_csvstats = (
 4820            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4821        )
 4822        if snpeff_stats:
 4823            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4824            snpeff_stats = full_path(snpeff_stats)
 4825            snpeff_options += f" -stats {snpeff_stats}"
 4826        if snpeff_csvstats:
 4827            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4828            snpeff_csvstats = full_path(snpeff_csvstats)
 4829            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4830
 4831        # Data
 4832        table_variants = self.get_table_variants()
 4833
 4834        # Check if not empty
 4835        log.debug("Check if not empty")
 4836        sql_query_chromosomes = (
 4837            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4838        )
 4839        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4840        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4841            log.info(f"VCF empty")
 4842            return
 4843
 4844        # Export in VCF
 4845        log.debug("Create initial file to annotate")
 4846        tmp_vcf = NamedTemporaryFile(
 4847            prefix=self.get_prefix(),
 4848            dir=self.get_tmp_dir(),
 4849            suffix=".vcf.gz",
 4850            delete=True,
 4851        )
 4852        tmp_vcf_name = tmp_vcf.name
 4853
 4854        # VCF header
 4855        vcf_reader = self.get_header()
 4856        log.debug("Initial header: " + str(vcf_reader.infos))
 4857
 4858        # Existing annotations
 4859        for vcf_annotation in self.get_header().infos:
 4860
 4861            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4862            log.debug(
 4863                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4864            )
 4865
 4866        # Memory limit
 4867        # if config.get("memory", None):
 4868        #     memory_limit = config.get("memory", "8G")
 4869        # else:
 4870        #     memory_limit = "8G"
 4871        memory_limit = self.get_memory("8G")
 4872        log.debug(f"memory_limit: {memory_limit}")
 4873
 4874        # snpEff java options
 4875        snpeff_java_options = (
 4876            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4877        )
 4878        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4879
 4880        force_update_annotation = True
 4881
 4882        if "ANN" not in self.get_header().infos or force_update_annotation:
 4883
 4884            # Check snpEff database
 4885            log.debug(f"Check snpEff databases {[assembly]}")
 4886            databases_download_snpeff(
 4887                folder=snpeff_databases, assemblies=[assembly], config=config
 4888            )
 4889
 4890            # Export VCF file
 4891            self.export_variant_vcf(
 4892                vcf_file=tmp_vcf_name,
 4893                remove_info=True,
 4894                add_samples=False,
 4895                index=True,
 4896            )
 4897
 4898            # Tmp file
 4899            err_files = []
 4900            tmp_annotate_vcf = NamedTemporaryFile(
 4901                prefix=self.get_prefix(),
 4902                dir=self.get_tmp_dir(),
 4903                suffix=".vcf",
 4904                delete=False,
 4905            )
 4906            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4907            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4908            err_files.append(tmp_annotate_vcf_name_err)
 4909
 4910            # Command
 4911            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 4912            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 4913            run_parallel_commands([snpeff_command], 1)
 4914
 4915            # Error messages
 4916            log.info(f"Error/Warning messages:")
 4917            error_message_command_all = []
 4918            error_message_command_warning = []
 4919            error_message_command_err = []
 4920            for err_file in err_files:
 4921                with open(err_file, "r") as f:
 4922                    for line in f:
 4923                        message = line.strip()
 4924                        error_message_command_all.append(message)
 4925                        if line.startswith("[W::"):
 4926                            error_message_command_warning.append(message)
 4927                        if line.startswith("[E::"):
 4928                            error_message_command_err.append(f"{err_file}: " + message)
 4929            # log info
 4930            for message in list(
 4931                set(error_message_command_err + error_message_command_warning)
 4932            ):
 4933                log.info(f"   {message}")
 4934            # debug info
 4935            for message in list(set(error_message_command_all)):
 4936                log.debug(f"   {message}")
 4937            # failed
 4938            if len(error_message_command_err):
 4939                log.error("Annotation failed: Error in commands")
 4940                raise ValueError("Annotation failed: Error in commands")
 4941
 4942            # Find annotation in header
 4943            with open(tmp_annotate_vcf_name, "rt") as f:
 4944                header_list = self.read_vcf_header(f)
 4945            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 4946
 4947            for ann in annovar_vcf_header.infos:
 4948                if ann not in self.get_header().infos:
 4949                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 4950
 4951            # Update variants
 4952            log.info(f"Annotation - Updating...")
 4953            self.update_from_vcf(tmp_annotate_vcf_name)
 4954
 4955        else:
 4956            if "ANN" in self.get_header().infos:
 4957                log.debug(f"Existing snpEff annotations in VCF")
 4958            if force_update_annotation:
 4959                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 4960
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Workflow: export the current variants to a temporary bgzipped VCF,
        run Annovar (``table_annovar.pl``) once per configured annotation
        database, post-process each Annovar output through a
        bcftools/sed/awk pipeline (dropping Annovar artifacts such as
        ``ANNOVAR_DATE``/``ALLELE_END``, cleaning empty "." fields and
        renaming INFO fields), merge all annotated files back together with
        ``bcftools merge``, then update the in-memory header and the
        variants table from the merged VCF.

        :param threads: number of threads to use; defaults to the object's
            configured thread count (``self.get_threads()``)
        :raises ValueError: if the annovar or bcftools binary cannot be
            found, or if any annotation command reports errors
        :return: None; returns early if the variants table is empty
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err file lists, collected across the whole run for final cleanup
        tmp_files = []
        err_files = []

        # DEBUG — delete_tmp is computed here but note it is not consulted by
        # the cleanup step at the bottom of this method (see NOTE there)
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl via perl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl command-line options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (mapping: database name -> {field: new_name})
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly sub-folder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty — nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header — updated in place with new INFO fields at the end
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug inventory of current INFO fields)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Hard-coded: always re-annotate, even if the field already exists
        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." here (string) while the snpEff path
            # uses remove_info=True — presumably "." keeps INFO as a dot
            # placeholder; confirm against export_variant_vcf
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing databases if needed)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset on every loop iteration, so
                # only the last database's err file carries over to the merge
                # step below — confirm intended
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to annotate (and their renamed counterparts)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered — renaming is applied via
                    # the rename file below (TODO: full rename support)
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("INFO/old new" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol (Annovar database name)
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options (genebase is handled via --argument above)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar (then move multianno output to pipeline input)
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation): "\x3b" escapes -> commas
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".") by rebuilding the
                # INFO column ($8) keeping only key=value pairs whose value != "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                # ("^INFO/x" means keep x and drop the rest; plain "INFO/x" drops x)
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages — scan stderr captures for bcftools-style
                # ([W::]/[E::]) and plain WARNING/ERROR prefixes
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed — any ERROR line aborts the whole annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge (original exported VCF + all per-database outputs)
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged file and add any new
                # INFO fields into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants table from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup always runs; the delete_tmp flag computed at
            # the top (debug mode) is not honored here — confirm intended
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5341
 5342    # Parquet
 5343    def annotation_parquet(self, threads: int = None) -> None:
 5344        """
 5345        It takes a VCF file, and annotates it with a parquet file
 5346
 5347        :param threads: number of threads to use for the annotation
 5348        :return: the value of the variable "result".
 5349        """
 5350
 5351        # DEBUG
 5352        log.debug("Start annotation with parquet databases")
 5353
 5354        # Threads
 5355        if not threads:
 5356            threads = self.get_threads()
 5357        log.debug("Threads: " + str(threads))
 5358
 5359        # DEBUG
 5360        delete_tmp = True
 5361        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5362            delete_tmp = False
 5363            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5364
 5365        # Config
 5366        databases_folders = set(
 5367            self.get_config()
 5368            .get("folders", {})
 5369            .get("databases", {})
 5370            .get("annotations", ["."])
 5371            + self.get_config()
 5372            .get("folders", {})
 5373            .get("databases", {})
 5374            .get("parquet", ["."])
 5375        )
 5376        log.debug("Databases annotations: " + str(databases_folders))
 5377
 5378        # Param
 5379        annotations = (
 5380            self.get_param()
 5381            .get("annotation", {})
 5382            .get("parquet", {})
 5383            .get("annotations", None)
 5384        )
 5385        log.debug("Annotations: " + str(annotations))
 5386
 5387        # Assembly
 5388        assembly = self.get_param().get(
 5389            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5390        )
 5391
 5392        # Force Update Annotation
 5393        force_update_annotation = (
 5394            self.get_param()
 5395            .get("annotation", {})
 5396            .get("options", {})
 5397            .get("annotations_update", False)
 5398        )
 5399        log.debug(f"force_update_annotation={force_update_annotation}")
 5400        force_append_annotation = (
 5401            self.get_param()
 5402            .get("annotation", {})
 5403            .get("options", {})
 5404            .get("annotations_append", False)
 5405        )
 5406        log.debug(f"force_append_annotation={force_append_annotation}")
 5407
 5408        # Data
 5409        table_variants = self.get_table_variants()
 5410
 5411        # Check if not empty
 5412        log.debug("Check if not empty")
 5413        sql_query_chromosomes_df = self.get_query_to_df(
 5414            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5415        )
 5416        if not sql_query_chromosomes_df["count"][0]:
 5417            log.info(f"VCF empty")
 5418            return
 5419
 5420        # VCF header
 5421        vcf_reader = self.get_header()
 5422        log.debug("Initial header: " + str(vcf_reader.infos))
 5423
 5424        # Nb Variants POS
 5425        log.debug("NB Variants Start")
 5426        nb_variants = self.conn.execute(
 5427            f"SELECT count(*) AS count FROM variants"
 5428        ).fetchdf()["count"][0]
 5429        log.debug("NB Variants Stop")
 5430
 5431        # Existing annotations
 5432        for vcf_annotation in self.get_header().infos:
 5433
 5434            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5435            log.debug(
 5436                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5437            )
 5438
 5439        # Added columns
 5440        added_columns = []
 5441
 5442        # drop indexes
 5443        log.debug(f"Drop indexes...")
 5444        self.drop_indexes()
 5445
 5446        if annotations:
 5447
 5448            if "ALL" in annotations:
 5449
 5450                all_param = annotations.get("ALL", {})
 5451                all_param_formats = all_param.get("formats", None)
 5452                all_param_releases = all_param.get("releases", None)
 5453
 5454                databases_infos_dict = self.scan_databases(
 5455                    database_formats=all_param_formats,
 5456                    database_releases=all_param_releases,
 5457                )
 5458                for database_infos in databases_infos_dict.keys():
 5459                    if database_infos not in annotations:
 5460                        annotations[database_infos] = {"INFO": None}
 5461
 5462            for annotation in annotations:
 5463
 5464                if annotation in ["ALL"]:
 5465                    continue
 5466
 5467                # Annotation Name
 5468                annotation_name = os.path.basename(annotation)
 5469
 5470                # Annotation fields
 5471                annotation_fields = annotations[annotation]
 5472                if not annotation_fields:
 5473                    annotation_fields = {"INFO": None}
 5474
 5475                log.debug(f"Annotation '{annotation_name}'")
 5476                log.debug(
 5477                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5478                )
 5479
 5480                # Create Database
 5481                database = Database(
 5482                    database=annotation,
 5483                    databases_folders=databases_folders,
 5484                    assembly=assembly,
 5485                )
 5486
 5487                # Find files
 5488                parquet_file = database.get_database()
 5489                parquet_hdr_file = database.get_header_file()
 5490                parquet_type = database.get_type()
 5491
 5492                # Check if files exists
 5493                if not parquet_file or not parquet_hdr_file:
 5494                    log.error("Annotation failed: file not found")
 5495                    raise ValueError("Annotation failed: file not found")
 5496                else:
 5497                    # Get parquet connexion
 5498                    parquet_sql_attach = database.get_sql_database_attach(
 5499                        output="query"
 5500                    )
 5501                    if parquet_sql_attach:
 5502                        self.conn.execute(parquet_sql_attach)
 5503                    parquet_file_link = database.get_sql_database_link()
 5504                    # Log
 5505                    log.debug(
 5506                        f"Annotation '{annotation_name}' - file: "
 5507                        + str(parquet_file)
 5508                        + " and "
 5509                        + str(parquet_hdr_file)
 5510                    )
 5511
 5512                    # Database full header columns
 5513                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5514                        parquet_hdr_file
 5515                    )
 5516                    # Log
 5517                    log.debug(
 5518                        "Annotation database header columns : "
 5519                        + str(parquet_hdr_vcf_header_columns)
 5520                    )
 5521
 5522                    # Load header as VCF object
 5523                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5524                    # Log
 5525                    log.debug(
 5526                        "Annotation database header: "
 5527                        + str(parquet_hdr_vcf_header_infos)
 5528                    )
 5529
 5530                    # Get extra infos
 5531                    parquet_columns = database.get_extra_columns()
 5532                    # Log
 5533                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5534
 5535                    # Add extra columns if "ALL" in annotation_fields
 5536                    # if "ALL" in annotation_fields:
 5537                    #     allow_add_extra_column = True
 5538                    if "ALL" in annotation_fields and database.get_extra_columns():
 5539                        for extra_column in database.get_extra_columns():
 5540                            if (
 5541                                extra_column not in annotation_fields
 5542                                and extra_column.replace("INFO/", "")
 5543                                not in parquet_hdr_vcf_header_infos
 5544                            ):
 5545                                parquet_hdr_vcf_header_infos[extra_column] = (
 5546                                    vcf.parser._Info(
 5547                                        extra_column,
 5548                                        ".",
 5549                                        "String",
 5550                                        f"{extra_column} description",
 5551                                        "unknown",
 5552                                        "unknown",
 5553                                        self.code_type_map["String"],
 5554                                    )
 5555                                )
 5556
 5557                    # For all fields in database
 5558                    annotation_fields_all = False
 5559                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5560                        annotation_fields_all = True
 5561                        annotation_fields = {
 5562                            key: key for key in parquet_hdr_vcf_header_infos
 5563                        }
 5564
 5565                        log.debug(
 5566                            "Annotation database header - All annotations added: "
 5567                            + str(annotation_fields)
 5568                        )
 5569
 5570                    # Init
 5571
 5572                    # List of annotation fields to use
 5573                    sql_query_annotation_update_info_sets = []
 5574
 5575                    # List of annotation to agregate
 5576                    sql_query_annotation_to_agregate = []
 5577
 5578                    # Number of fields
 5579                    nb_annotation_field = 0
 5580
 5581                    # Annotation fields processed
 5582                    annotation_fields_processed = []
 5583
 5584                    # Columns mapping
 5585                    map_columns = database.map_columns(
 5586                        columns=annotation_fields, prefixes=["INFO/"]
 5587                    )
 5588
 5589                    # Query dict for fields to remove (update option)
 5590                    query_dict_remove = {}
 5591
                    # Fetch annotation fields
 5593                    for annotation_field in annotation_fields:
 5594
 5595                        # annotation_field_column
 5596                        annotation_field_column = map_columns.get(
 5597                            annotation_field, "INFO"
 5598                        )
 5599
 5600                        # field new name, if parametered
 5601                        annotation_fields_new_name = annotation_fields.get(
 5602                            annotation_field, annotation_field
 5603                        )
 5604                        if not annotation_fields_new_name:
 5605                            annotation_fields_new_name = annotation_field
 5606
 5607                        # To annotate
 5608                        # force_update_annotation = True
 5609                        # force_append_annotation = True
 5610                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5611                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5612                            force_update_annotation
 5613                            or force_append_annotation
 5614                            or (
 5615                                annotation_fields_new_name
 5616                                not in self.get_header().infos
 5617                            )
 5618                        ):
 5619
 5620                            # Add field to annotation to process list
 5621                            annotation_fields_processed.append(
 5622                                annotation_fields_new_name
 5623                            )
 5624
 5625                            # explode infos for the field
 5626                            annotation_fields_new_name_info_msg = ""
 5627                            if (
 5628                                force_update_annotation
 5629                                and annotation_fields_new_name
 5630                                in self.get_header().infos
 5631                            ):
 5632                                # Remove field from INFO
 5633                                query = f"""
 5634                                    UPDATE {table_variants} as table_variants
 5635                                    SET INFO = REGEXP_REPLACE(
 5636                                                concat(table_variants.INFO,''),
 5637                                                ';*{annotation_fields_new_name}=[^;]*',
 5638                                                ''
 5639                                                )
 5640                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5641                                """
 5642                                annotation_fields_new_name_info_msg = " [update]"
 5643                                query_dict_remove[
 5644                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5645                                ] = query
 5646
 5647                            # Sep between fields in INFO
 5648                            nb_annotation_field += 1
 5649                            if nb_annotation_field > 1:
 5650                                annotation_field_sep = ";"
 5651                            else:
 5652                                annotation_field_sep = ""
 5653
 5654                            log.info(
 5655                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5656                            )
 5657
 5658                            # Add INFO field to header
 5659                            parquet_hdr_vcf_header_infos_number = (
 5660                                parquet_hdr_vcf_header_infos[annotation_field].num
 5661                                or "."
 5662                            )
 5663                            parquet_hdr_vcf_header_infos_type = (
 5664                                parquet_hdr_vcf_header_infos[annotation_field].type
 5665                                or "String"
 5666                            )
 5667                            parquet_hdr_vcf_header_infos_description = (
 5668                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5669                                or f"{annotation_field} description"
 5670                            )
 5671                            parquet_hdr_vcf_header_infos_source = (
 5672                                parquet_hdr_vcf_header_infos[annotation_field].source
 5673                                or "unknown"
 5674                            )
 5675                            parquet_hdr_vcf_header_infos_version = (
 5676                                parquet_hdr_vcf_header_infos[annotation_field].version
 5677                                or "unknown"
 5678                            )
 5679
 5680                            vcf_reader.infos[annotation_fields_new_name] = (
 5681                                vcf.parser._Info(
 5682                                    annotation_fields_new_name,
 5683                                    parquet_hdr_vcf_header_infos_number,
 5684                                    parquet_hdr_vcf_header_infos_type,
 5685                                    parquet_hdr_vcf_header_infos_description,
 5686                                    parquet_hdr_vcf_header_infos_source,
 5687                                    parquet_hdr_vcf_header_infos_version,
 5688                                    self.code_type_map[
 5689                                        parquet_hdr_vcf_header_infos_type
 5690                                    ],
 5691                                )
 5692                            )
 5693
 5694                            # Append
 5695                            if force_append_annotation:
 5696                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5697                            else:
 5698                                query_case_when_append = ""
 5699
 5700                            # Annotation/Update query fields
 5701                            # Found in INFO column
 5702                            if (
 5703                                annotation_field_column == "INFO"
 5704                                and "INFO" in parquet_hdr_vcf_header_columns
 5705                            ):
 5706                                sql_query_annotation_update_info_sets.append(
 5707                                    f"""
 5708                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5709                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5710                                        ELSE ''
 5711                                    END
 5712                                """
 5713                                )
 5714                            # Found in a specific column
 5715                            else:
 5716                                sql_query_annotation_update_info_sets.append(
 5717                                    f"""
 5718                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
 5719                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
 5720                                        ELSE ''
 5721                                    END
 5722                                """
 5723                                )
 5724                                sql_query_annotation_to_agregate.append(
 5725                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5726                                )
 5727
 5728                        # Not to annotate
 5729                        else:
 5730
 5731                            if force_update_annotation:
 5732                                annotation_message = "forced"
 5733                            else:
 5734                                annotation_message = "skipped"
 5735
 5736                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5737                                log.warning(
 5738                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5739                                )
 5740                            if annotation_fields_new_name in self.get_header().infos:
 5741                                log.warning(
 5742                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5743                                )
 5744
 5745                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5746                    # allow_annotation_full_info = True
 5747                    allow_annotation_full_info = not force_append_annotation
 5748
 5749                    if parquet_type in ["regions"]:
 5750                        allow_annotation_full_info = False
 5751
 5752                    if (
 5753                        allow_annotation_full_info
 5754                        and nb_annotation_field == len(annotation_fields)
 5755                        and annotation_fields_all
 5756                        and (
 5757                            "INFO" in parquet_hdr_vcf_header_columns
 5758                            and "INFO" in database.get_extra_columns()
 5759                        )
 5760                    ):
 5761                        log.debug("Column INFO annotation enabled")
 5762                        sql_query_annotation_update_info_sets = []
 5763                        sql_query_annotation_update_info_sets.append(
 5764                            f" table_parquet.INFO "
 5765                        )
 5766
 5767                    if sql_query_annotation_update_info_sets:
 5768
 5769                        # Annotate
 5770                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5771
 5772                        # Join query annotation update info sets for SQL
 5773                        sql_query_annotation_update_info_sets_sql = ",".join(
 5774                            sql_query_annotation_update_info_sets
 5775                        )
 5776
 5777                        # Check chromosomes list (and variants infos)
 5778                        sql_query_chromosomes = f"""
 5779                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5780                            FROM {table_variants} as table_variants
 5781                            GROUP BY table_variants."#CHROM"
 5782                            ORDER BY table_variants."#CHROM"
 5783                            """
 5784                        sql_query_chromosomes_df = self.conn.execute(
 5785                            sql_query_chromosomes
 5786                        ).df()
 5787                        sql_query_chromosomes_dict = {
 5788                            entry["CHROM"]: {
 5789                                "count": entry["count_variants"],
 5790                                "min": entry["min_variants"],
 5791                                "max": entry["max_variants"],
 5792                            }
 5793                            for index, entry in sql_query_chromosomes_df.iterrows()
 5794                        }
 5795
 5796                        # Init
 5797                        nb_of_query = 0
 5798                        nb_of_variant_annotated = 0
 5799                        query_dict = query_dict_remove
 5800
 5801                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5802                        for chrom in sql_query_chromosomes_dict:
 5803
 5804                            # Number of variant by chromosome
 5805                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5806                                chrom, {}
 5807                            ).get("count", 0)
 5808
 5809                            log.debug(
 5810                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5811                            )
 5812
 5813                            # Annotation with regions database
 5814                            if parquet_type in ["regions"]:
 5815                                sql_query_annotation_from_clause = f"""
 5816                                    FROM (
 5817                                        SELECT 
 5818                                            '{chrom}' AS \"#CHROM\",
 5819                                            table_variants_from.\"POS\" AS \"POS\",
 5820                                            {",".join(sql_query_annotation_to_agregate)}
 5821                                        FROM {table_variants} as table_variants_from
 5822                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5823                                            table_parquet_from."#CHROM" = '{chrom}'
 5824                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5825                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5826                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5827                                                )
 5828                                        )
 5829                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5830                                        GROUP BY table_variants_from.\"POS\"
 5831                                        )
 5832                                        as table_parquet
 5833                                """
 5834
 5835                                sql_query_annotation_where_clause = """
 5836                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5837                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5838                                """
 5839
 5840                            # Annotation with variants database
 5841                            else:
 5842                                sql_query_annotation_from_clause = f"""
 5843                                    FROM {parquet_file_link} as table_parquet
 5844                                """
 5845                                sql_query_annotation_where_clause = f"""
 5846                                    table_variants."#CHROM" = '{chrom}'
 5847                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5848                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5849                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5850                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5851                                """
 5852
 5853                            # Create update query
 5854                            sql_query_annotation_chrom_interval_pos = f"""
 5855                                UPDATE {table_variants} as table_variants
 5856                                    SET INFO = 
 5857                                        concat(
 5858                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5859                                                THEN table_variants.INFO
 5860                                                ELSE ''
 5861                                            END
 5862                                            ,
 5863                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5864                                                        AND (
 5865                                                        concat({sql_query_annotation_update_info_sets_sql})
 5866                                                        )
 5867                                                        NOT IN ('','.') 
 5868                                                    THEN ';'
 5869                                                    ELSE ''
 5870                                            END
 5871                                            ,
 5872                                            {sql_query_annotation_update_info_sets_sql}
 5873                                            )
 5874                                    {sql_query_annotation_from_clause}
 5875                                    WHERE {sql_query_annotation_where_clause}
 5876                                    ;
 5877                                """
 5878
 5879                            # Add update query to dict
 5880                            query_dict[
 5881                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 5882                            ] = sql_query_annotation_chrom_interval_pos
 5883
 5884                        nb_of_query = len(query_dict)
 5885                        num_query = 0
 5886
 5887                        # SET max_expression_depth TO x
 5888                        self.conn.execute("SET max_expression_depth TO 10000")
 5889
 5890                        for query_name in query_dict:
 5891                            query = query_dict[query_name]
 5892                            num_query += 1
 5893                            log.info(
 5894                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 5895                            )
 5896                            result = self.conn.execute(query)
 5897                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 5898                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 5899                            log.info(
 5900                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 5901                            )
 5902
 5903                        log.info(
 5904                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 5905                        )
 5906
 5907                    else:
 5908
 5909                        log.info(
 5910                            f"Annotation '{annotation_name}' - No Annotations available"
 5911                        )
 5912
 5913                    log.debug("Final header: " + str(vcf_reader.infos))
 5914
 5915        # Remove added columns
 5916        for added_column in added_columns:
 5917            self.drop_column(column=added_column)
 5918
 5919    def annotation_splice(self, threads: int = None) -> None:
 5920        """
 5921        This function annotate with snpEff
 5922
 5923        :param threads: The number of threads to use
 5924        :return: the value of the variable "return_value".
 5925        """
 5926
 5927        # DEBUG
 5928        log.debug("Start annotation with splice tools")
 5929
 5930        # Threads
 5931        if not threads:
 5932            threads = self.get_threads()
 5933        log.debug("Threads: " + str(threads))
 5934
 5935        # DEBUG
 5936        delete_tmp = True
 5937        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5938            delete_tmp = False
 5939            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5940
 5941        # Config
 5942        config = self.get_config()
 5943        log.debug("Config: " + str(config))
 5944        splice_config = config.get("tools", {}).get("splice", {})
 5945        if not splice_config:
 5946            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 5947        if not splice_config:
 5948            msg_err = "No Splice tool config"
 5949            log.error(msg_err)
 5950            raise ValueError(msg_err)
 5951        log.debug(f"splice_config={splice_config}")
 5952
 5953        # Config - Folders - Databases
 5954        databases_folders = (
 5955            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 5956        )
 5957        log.debug("Databases annotations: " + str(databases_folders))
 5958
 5959        # Splice docker image
 5960        splice_docker_image = splice_config.get("docker").get("image")
 5961
 5962        # Pull splice image if it's not already there
 5963        if not check_docker_image_exists(splice_docker_image):
 5964            log.warning(
 5965                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 5966            )
 5967            try:
 5968                command(f"docker pull {splice_config.get('docker').get('image')}")
 5969            except subprocess.CalledProcessError:
 5970                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 5971                log.error(msg_err)
 5972                raise ValueError(msg_err)
 5973                return None
 5974
 5975        # Config - splice databases
 5976        splice_databases = (
 5977            config.get("folders", {})
 5978            .get("databases", {})
 5979            .get("splice", DEFAULT_SPLICE_FOLDER)
 5980        )
 5981        splice_databases = full_path(splice_databases)
 5982
 5983        # Param
 5984        param = self.get_param()
 5985        log.debug("Param: " + str(param))
 5986
 5987        # Param
 5988        options = param.get("annotation", {}).get("splice", {})
 5989        log.debug("Options: " + str(options))
 5990
 5991        # Data
 5992        table_variants = self.get_table_variants()
 5993
 5994        # Check if not empty
 5995        log.debug("Check if not empty")
 5996        sql_query_chromosomes = (
 5997            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5998        )
 5999        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6000            log.info("VCF empty")
 6001            return None
 6002
 6003        # Export in VCF
 6004        log.debug("Create initial file to annotate")
 6005
 6006        # Create output folder
 6007        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6008        if not os.path.exists(output_folder):
 6009            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6010
 6011        # Create tmp VCF file
 6012        tmp_vcf = NamedTemporaryFile(
 6013            prefix=self.get_prefix(),
 6014            dir=output_folder,
 6015            suffix=".vcf",
 6016            delete=False,
 6017        )
 6018        tmp_vcf_name = tmp_vcf.name
 6019
 6020        # VCF header
 6021        header = self.get_header()
 6022
 6023        # Existing annotations
 6024        for vcf_annotation in self.get_header().infos:
 6025
 6026            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6027            log.debug(
 6028                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6029            )
 6030
 6031        # Memory limit
 6032        if config.get("memory", None):
 6033            memory_limit = config.get("memory", "8G").upper()
 6034            # upper()
 6035        else:
 6036            memory_limit = "8G"
 6037        log.debug(f"memory_limit: {memory_limit}")
 6038
 6039        # Check number of variants to annotate
 6040        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6041        where_clause_regex_spip = r"SPiP_\w+"
 6042        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6043        df_list_of_variants_to_annotate = self.get_query_to_df(
 6044            query=f""" SELECT * FROM variants {where_clause} """
 6045        )
 6046        if len(df_list_of_variants_to_annotate) == 0:
 6047            log.warning(
 6048                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6049            )
 6050            return None
 6051        else:
 6052            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6053
 6054        # Export VCF file
 6055        self.export_variant_vcf(
 6056            vcf_file=tmp_vcf_name,
 6057            remove_info=True,
 6058            add_samples=True,
 6059            index=False,
 6060            where_clause=where_clause,
 6061        )
 6062
 6063        # Create docker container and launch splice analysis
 6064        if splice_config:
 6065
 6066            # Splice mount folders
 6067            mount_folders = splice_config.get("mount", {})
 6068
 6069            # Genome mount
 6070            mount_folders[
 6071                config.get("folders", {})
 6072                .get("databases", {})
 6073                .get("genomes", DEFAULT_GENOME_FOLDER)
 6074            ] = "ro"
 6075
 6076            # SpliceAI mount
 6077            mount_folders[
 6078                config.get("folders", {})
 6079                .get("databases", {})
 6080                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6081            ] = "ro"
 6082
 6083            # Genome mount
 6084            mount_folders[
 6085                config.get("folders", {})
 6086                .get("databases", {})
 6087                .get("spip", DEFAULT_SPIP_FOLDER)
 6088            ] = "ro"
 6089
 6090            # Mount folders
 6091            mount = []
 6092
 6093            # Config mount
 6094            mount = [
 6095                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6096                for path, mode in mount_folders.items()
 6097            ]
 6098
 6099            if any(value for value in splice_config.values() if value is None):
 6100                log.warning("At least one splice config parameter is empty")
 6101                return None
 6102
 6103            # Params in splice nf
 6104            def check_values(dico: dict):
 6105                """
 6106                Ensure parameters for NF splice pipeline
 6107                """
 6108                for key, val in dico.items():
 6109                    if key == "genome":
 6110                        if any(
 6111                            assemb in options.get("genome", {})
 6112                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6113                        ):
 6114                            yield f"--{key} hg19"
 6115                        elif any(
 6116                            assemb in options.get("genome", {})
 6117                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6118                        ):
 6119                            yield f"--{key} hg38"
 6120                    elif (
 6121                        (isinstance(val, str) and val)
 6122                        or isinstance(val, int)
 6123                        or isinstance(val, bool)
 6124                    ):
 6125                        yield f"--{key} {val}"
 6126
 6127            # Genome
 6128            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6129            options["genome"] = genome
 6130
 6131            # NF params
 6132            nf_params = []
 6133
 6134            # Add options
 6135            if options:
 6136                nf_params = list(check_values(options))
 6137                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6138            else:
 6139                log.debug("No NF params provided")
 6140
 6141            # Add threads
 6142            if "threads" not in options.keys():
 6143                nf_params.append(f"--threads {threads}")
 6144
 6145            # Genome path
 6146            genome_path = find_genome(
 6147                config.get("folders", {})
 6148                .get("databases", {})
 6149                .get("genomes", DEFAULT_GENOME_FOLDER),
 6150                file=f"{genome}.fa",
 6151            )
 6152            # Add genome path
 6153            if not genome_path:
 6154                raise ValueError(
 6155                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6156                )
 6157            else:
 6158                log.debug(f"Genome: {genome_path}")
 6159                nf_params.append(f"--genome_path {genome_path}")
 6160
 6161            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6162                """
 6163                Setting up updated databases for SPiP and SpliceAI
 6164                """
 6165
 6166                try:
 6167
 6168                    # SpliceAI assembly transcriptome
 6169                    spliceai_assembly = os.path.join(
 6170                        config.get("folders", {})
 6171                        .get("databases", {})
 6172                        .get("spliceai", {}),
 6173                        options.get("genome"),
 6174                        "transcriptome",
 6175                    )
 6176                    spip_assembly = options.get("genome")
 6177
 6178                    spip = find(
 6179                        f"transcriptome_{spip_assembly}.RData",
 6180                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6181                    )
 6182                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6183                    log.debug(f"SPiP annotations: {spip}")
 6184                    log.debug(f"SpliceAI annotations: {spliceai}")
 6185                    if spip and spliceai:
 6186                        return [
 6187                            f"--spip_transcriptome {spip}",
 6188                            f"--spliceai_annotations {spliceai}",
 6189                        ]
 6190                    else:
 6191                        # TODO crash and go on with basic annotations ?
 6192                        # raise ValueError(
 6193                        #     "Can't find splice databases in configuration EXIT"
 6194                        # )
 6195                        log.warning(
 6196                            "Can't find splice databases in configuration, use annotations file from image"
 6197                        )
 6198                except TypeError:
 6199                    log.warning(
 6200                        "Can't find splice databases in configuration, use annotations file from image"
 6201                    )
 6202                    return []
 6203
            # Add options, check if a transcriptome option has already been provided
 6205            if (
 6206                "spip_transcriptome" not in nf_params
 6207                and "spliceai_transcriptome" not in nf_params
 6208            ):
 6209                splice_reference = splice_annotations(options, config)
 6210                if splice_reference:
 6211                    nf_params.extend(splice_reference)
 6212
 6213            nf_params.append(f"--output_folder {output_folder}")
 6214
 6215            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6216            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6217            log.debug(cmd)
 6218
 6219            splice_config["docker"]["command"] = cmd
 6220
 6221            docker_cmd = get_bin_command(
 6222                tool="splice",
 6223                bin_type="docker",
 6224                config=config,
 6225                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6226                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6227            )
 6228
 6229            # Docker debug
 6230            # if splice_config.get("rm_container"):
 6231            #     rm_container = "--rm"
 6232            # else:
 6233            #     rm_container = ""
 6234            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6235
 6236            log.debug(docker_cmd)
 6237            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6238            log.debug(res.stdout)
 6239            if res.stderr:
 6240                log.error(res.stderr)
 6241            res.check_returncode()
 6242        else:
 6243            log.warning(f"Splice tool configuration not found: {config}")
 6244
 6245        # Update variants
 6246        log.info("Annotation - Updating...")
 6247        # Test find output vcf
 6248        log.debug(
 6249            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6250        )
 6251        output_vcf = []
 6252        # Wrong folder to look in
 6253        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6254            if (
 6255                files
 6256                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6257            ):
 6258                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6259        # log.debug(os.listdir(options.get("output_folder")))
 6260        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6261        if not output_vcf:
 6262            log.debug(
 6263                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6264            )
 6265        else:
 6266            # Get new header from annotated vcf
 6267            log.debug(f"Initial header: {len(header.infos)} fields")
 6268            # Create new header with splice infos
 6269            new_vcf = Variants(input=output_vcf[0])
 6270            new_vcf_header = new_vcf.get_header().infos
 6271            for keys, infos in new_vcf_header.items():
 6272                if keys not in header.infos.keys():
 6273                    header.infos[keys] = infos
 6274            log.debug(f"New header: {len(header.infos)} fields")
 6275            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6276            self.update_from_vcf(output_vcf[0])
 6277
 6278        # Remove folder
 6279        remove_if_exists(output_folder)
 6280
 6281    ###
 6282    # Prioritization
 6283    ###
 6284
 6285    def get_config_default(self, name: str) -> dict:
 6286        """
 6287        The function `get_config_default` returns a dictionary containing default configurations for
 6288        various calculations and prioritizations.
 6289
 6290        :param name: The `get_config_default` function returns a dictionary containing default
 6291        configurations for different calculations and prioritizations. The `name` parameter is used to
 6292        specify which specific configuration to retrieve from the dictionary
 6293        :type name: str
 6294        :return: The function `get_config_default` returns a dictionary containing default configuration
 6295        settings for different calculations and prioritizations. The specific configuration settings are
 6296        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6297        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6298        returned. If there is no match, an empty dictionary is returned.
 6299        """
 6300
 6301        config_default = {
 6302            "calculations": {
 6303                "variant_chr_pos_alt_ref": {
 6304                    "type": "sql",
 6305                    "name": "variant_chr_pos_alt_ref",
 6306                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6307                    "available": False,
 6308                    "output_column_name": "variant_chr_pos_alt_ref",
 6309                    "output_column_type": "String",
 6310                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6311                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6312                    "operation_info": True,
 6313                },
 6314                "VARTYPE": {
 6315                    "type": "sql",
 6316                    "name": "VARTYPE",
 6317                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6318                    "available": True,
 6319                    "output_column_name": "VARTYPE",
 6320                    "output_column_type": "String",
 6321                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6322                    "operation_query": """
 6323                            CASE
 6324                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6325                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6326                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6327                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6328                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6329                                ELSE 'UNDEFINED'
 6330                            END
 6331                            """,
 6332                    "info_fields": ["SVTYPE"],
 6333                    "operation_info": True,
 6334                },
 6335                "snpeff_hgvs": {
 6336                    "type": "python",
 6337                    "name": "snpeff_hgvs",
 6338                    "description": "HGVS nomenclatures from snpEff annotation",
 6339                    "available": True,
 6340                    "function_name": "calculation_extract_snpeff_hgvs",
 6341                    "function_params": ["snpeff_hgvs", "ANN"],
 6342                },
 6343                "snpeff_ann_explode": {
 6344                    "type": "python",
 6345                    "name": "snpeff_ann_explode",
 6346                    "description": "Explode snpEff annotations with uniquify values",
 6347                    "available": True,
 6348                    "function_name": "calculation_snpeff_ann_explode",
 6349                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6350                },
 6351                "snpeff_ann_explode_uniquify": {
 6352                    "type": "python",
 6353                    "name": "snpeff_ann_explode_uniquify",
 6354                    "description": "Explode snpEff annotations",
 6355                    "available": True,
 6356                    "function_name": "calculation_snpeff_ann_explode",
 6357                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6358                },
 6359                "snpeff_ann_explode_json": {
 6360                    "type": "python",
 6361                    "name": "snpeff_ann_explode_json",
 6362                    "description": "Explode snpEff annotations in JSON format",
 6363                    "available": True,
 6364                    "function_name": "calculation_snpeff_ann_explode",
 6365                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6366                },
 6367                "NOMEN": {
 6368                    "type": "python",
 6369                    "name": "NOMEN",
 6370                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6371                    "available": True,
 6372                    "function_name": "calculation_extract_nomen",
 6373                    "function_params": [],
 6374                },
 6375                "FINDBYPIPELINE": {
 6376                    "type": "python",
 6377                    "name": "FINDBYPIPELINE",
 6378                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6379                    "available": True,
 6380                    "function_name": "calculation_find_by_pipeline",
 6381                    "function_params": ["findbypipeline"],
 6382                },
 6383                "FINDBYSAMPLE": {
 6384                    "type": "python",
 6385                    "name": "FINDBYSAMPLE",
 6386                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6387                    "available": True,
 6388                    "function_name": "calculation_find_by_pipeline",
 6389                    "function_params": ["findbysample"],
 6390                },
 6391                "GENOTYPECONCORDANCE": {
 6392                    "type": "python",
 6393                    "name": "GENOTYPECONCORDANCE",
 6394                    "description": "Concordance of genotype for multi caller VCF",
 6395                    "available": True,
 6396                    "function_name": "calculation_genotype_concordance",
 6397                    "function_params": [],
 6398                },
 6399                "BARCODE": {
 6400                    "type": "python",
 6401                    "name": "BARCODE",
 6402                    "description": "BARCODE as VaRank tool",
 6403                    "available": True,
 6404                    "function_name": "calculation_barcode",
 6405                    "function_params": [],
 6406                },
 6407                "BARCODEFAMILY": {
 6408                    "type": "python",
 6409                    "name": "BARCODEFAMILY",
 6410                    "description": "BARCODEFAMILY as VaRank tool",
 6411                    "available": True,
 6412                    "function_name": "calculation_barcode_family",
 6413                    "function_params": ["BCF"],
 6414                },
 6415                "TRIO": {
 6416                    "type": "python",
 6417                    "name": "TRIO",
 6418                    "description": "Inheritance for a trio family",
 6419                    "available": True,
 6420                    "function_name": "calculation_trio",
 6421                    "function_params": [],
 6422                },
 6423                "VAF": {
 6424                    "type": "python",
 6425                    "name": "VAF",
 6426                    "description": "Variant Allele Frequency (VAF) harmonization",
 6427                    "available": True,
 6428                    "function_name": "calculation_vaf_normalization",
 6429                    "function_params": [],
 6430                },
 6431                "VAF_stats": {
 6432                    "type": "python",
 6433                    "name": "VAF_stats",
 6434                    "description": "Variant Allele Frequency (VAF) statistics",
 6435                    "available": True,
 6436                    "function_name": "calculation_genotype_stats",
 6437                    "function_params": ["VAF"],
 6438                },
 6439                "DP_stats": {
 6440                    "type": "python",
 6441                    "name": "DP_stats",
 6442                    "description": "Depth (DP) statistics",
 6443                    "available": True,
 6444                    "function_name": "calculation_genotype_stats",
 6445                    "function_params": ["DP"],
 6446                },
 6447                "variant_id": {
 6448                    "type": "python",
 6449                    "name": "variant_id",
 6450                    "description": "Variant ID generated from variant position and type",
 6451                    "available": True,
 6452                    "function_name": "calculation_variant_id",
 6453                    "function_params": [],
 6454                },
 6455                "transcripts_json": {
 6456                    "type": "python",
 6457                    "name": "transcripts_json",
 6458                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6459                    "available": True,
 6460                    "function_name": "calculation_transcripts_annotation",
 6461                    "function_params": ["transcripts_json", None],
 6462                },
 6463                "transcripts_ann": {
 6464                    "type": "python",
 6465                    "name": "transcripts_ann",
 6466                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6467                    "available": True,
 6468                    "function_name": "calculation_transcripts_annotation",
 6469                    "function_params": [None, "transcripts_ann"],
 6470                },
 6471                "transcripts_annotations": {
 6472                    "type": "python",
 6473                    "name": "transcripts_annotations",
 6474                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6475                    "available": True,
 6476                    "function_name": "calculation_transcripts_annotation",
 6477                    "function_params": [None, None],
 6478                },
 6479                "transcripts_prioritization": {
 6480                    "type": "python",
 6481                    "name": "transcripts_prioritization",
 6482                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6483                    "available": True,
 6484                    "function_name": "calculation_transcripts_prioritization",
 6485                    "function_params": [],
 6486                },
 6487            },
 6488            "prioritizations": {
 6489                "default": {
 6490                    "filter": [
 6491                        {
 6492                            "type": "notequals",
 6493                            "value": "!PASS|\\.",
 6494                            "score": 0,
 6495                            "flag": "FILTERED",
 6496                            "comment": ["Bad variant quality"],
 6497                        },
 6498                        {
 6499                            "type": "equals",
 6500                            "value": "REJECT",
 6501                            "score": -20,
 6502                            "flag": "PASS",
 6503                            "comment": ["Bad variant quality"],
 6504                        },
 6505                    ],
 6506                    "DP": [
 6507                        {
 6508                            "type": "gte",
 6509                            "value": "50",
 6510                            "score": 5,
 6511                            "flag": "PASS",
 6512                            "comment": ["DP higher than 50"],
 6513                        }
 6514                    ],
 6515                    "ANN": [
 6516                        {
 6517                            "type": "contains",
 6518                            "value": "HIGH",
 6519                            "score": 5,
 6520                            "flag": "PASS",
 6521                            "comment": [
 6522                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6523                            ],
 6524                        },
 6525                        {
 6526                            "type": "contains",
 6527                            "value": "MODERATE",
 6528                            "score": 3,
 6529                            "flag": "PASS",
 6530                            "comment": [
 6531                                "A non-disruptive variant that might change protein effectiveness"
 6532                            ],
 6533                        },
 6534                        {
 6535                            "type": "contains",
 6536                            "value": "LOW",
 6537                            "score": 0,
 6538                            "flag": "FILTERED",
 6539                            "comment": [
 6540                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6541                            ],
 6542                        },
 6543                        {
 6544                            "type": "contains",
 6545                            "value": "MODIFIER",
 6546                            "score": 0,
 6547                            "flag": "FILTERED",
 6548                            "comment": [
 6549                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6550                            ],
 6551                        },
 6552                    ],
 6553                }
 6554            },
 6555        }
 6556
 6557        return config_default.get(name, None)
 6558
 6559    def get_config_json(
 6560        self, name: str, config_dict: dict = {}, config_file: str = None
 6561    ) -> dict:
 6562        """
 6563        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6564        default values, a dictionary, and a file.
 6565
 6566        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6567        the name of the configuration. It is used to identify and retrieve the configuration settings
 6568        for a specific component or module
 6569        :type name: str
 6570        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6571        dictionary that allows you to provide additional configuration settings or overrides. When you
 6572        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6573        the key is the configuration setting you want to override or
 6574        :type config_dict: dict
 6575        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6576        specify the path to a configuration file that contains additional settings. If provided, the
 6577        function will read the contents of this file and update the configuration dictionary with the
 6578        values found in the file, overriding any existing values with the
 6579        :type config_file: str
 6580        :return: The function `get_config_json` returns a dictionary containing the configuration
 6581        settings.
 6582        """
 6583
 6584        # Create with default prioritizations
 6585        config_default = self.get_config_default(name=name)
 6586        configuration = config_default
 6587        # log.debug(f"configuration={configuration}")
 6588
 6589        # Replace prioritizations from dict
 6590        for config in config_dict:
 6591            configuration[config] = config_dict[config]
 6592
 6593        # Replace prioritizations from file
 6594        config_file = full_path(config_file)
 6595        if config_file:
 6596            if os.path.exists(config_file):
 6597                with open(config_file) as config_file_content:
 6598                    config_file_dict = json.load(config_file_content)
 6599                for config in config_file_dict:
 6600                    configuration[config] = config_file_dict[config]
 6601            else:
 6602                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6603                log.error(msg_error)
 6604                raise ValueError(msg_error)
 6605
 6606        return configuration
 6607
 6608    def prioritization(
 6609        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6610    ) -> bool:
 6611        """
 6612        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6613        prioritizes variants based on configured profiles and criteria.
 6614
 6615        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6616        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6617        a table name is provided, the method will prioritize the variants in that specific table
 6618        :type table: str
 6619        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6620        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6621        provided, the code will use a default prefix value of "PZ"
 6622        :type pz_prefix: str
 6623        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6624        additional parameters specific to the prioritization process. These parameters can include
 6625        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6626        configurations needed for the prioritization of variants in a V
 6627        :type pz_param: dict
 6628        :return: A boolean value (True) is being returned from the `prioritization` function.
 6629        """
 6630
 6631        # Config
 6632        config = self.get_config()
 6633
 6634        # Param
 6635        param = self.get_param()
 6636
 6637        # Prioritization param
 6638        if pz_param is not None:
 6639            prioritization_param = pz_param
 6640        else:
 6641            prioritization_param = param.get("prioritization", {})
 6642
 6643        # Configuration profiles
 6644        prioritization_config_file = prioritization_param.get(
 6645            "prioritization_config", None
 6646        )
 6647        prioritization_config_file = full_path(prioritization_config_file)
 6648        prioritizations_config = self.get_config_json(
 6649            name="prioritizations", config_file=prioritization_config_file
 6650        )
 6651
 6652        # Prioritization prefix
 6653        pz_prefix_default = "PZ"
 6654        if pz_prefix is None:
 6655            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6656
 6657        # Prioritization options
 6658        profiles = prioritization_param.get("profiles", [])
 6659        if isinstance(profiles, str):
 6660            profiles = profiles.split(",")
 6661        pzfields = prioritization_param.get(
 6662            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6663        )
 6664        if isinstance(pzfields, str):
 6665            pzfields = pzfields.split(",")
 6666        default_profile = prioritization_param.get("default_profile", None)
 6667        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6668        prioritization_score_mode = prioritization_param.get(
 6669            "prioritization_score_mode", "HOWARD"
 6670        )
 6671
 6672        # Quick Prioritizations
 6673        prioritizations = param.get("prioritizations", None)
 6674        if prioritizations:
 6675            log.info("Quick Prioritization:")
 6676            for profile in prioritizations.split(","):
 6677                if profile not in profiles:
 6678                    profiles.append(profile)
 6679                    log.info(f"   {profile}")
 6680
 6681        # If profile "ALL" provided, all profiles in the config profiles
 6682        if "ALL" in profiles:
 6683            profiles = list(prioritizations_config.keys())
 6684
 6685        for profile in profiles:
 6686            if prioritizations_config.get(profile, None):
 6687                log.debug(f"Profile '{profile}' configured")
 6688            else:
 6689                msg_error = f"Profile '{profile}' NOT configured"
 6690                log.error(msg_error)
 6691                raise ValueError(msg_error)
 6692
 6693        if profiles:
 6694            log.info(f"Prioritization... ")
 6695        else:
 6696            log.debug(f"No profile defined")
 6697            return False
 6698
 6699        if not default_profile and len(profiles):
 6700            default_profile = profiles[0]
 6701
 6702        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6703        log.debug("Profiles to check: " + str(list(profiles)))
 6704
 6705        # Variables
 6706        if table is not None:
 6707            table_variants = table
 6708        else:
 6709            table_variants = self.get_table_variants(clause="update")
 6710        log.debug(f"Table to prioritize: {table_variants}")
 6711
 6712        # Added columns
 6713        added_columns = []
 6714
 6715        # Create list of PZfields
 6716        # List of PZFields
 6717        list_of_pzfields_original = pzfields + [
 6718            pzfield + pzfields_sep + profile
 6719            for pzfield in pzfields
 6720            for profile in profiles
 6721        ]
 6722        list_of_pzfields = []
 6723        log.debug(f"{list_of_pzfields_original}")
 6724
 6725        # Remove existing PZfields to use if exists
 6726        for pzfield in list_of_pzfields_original:
 6727            if self.get_header().infos.get(pzfield, None) is None:
 6728                list_of_pzfields.append(pzfield)
 6729                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6730            else:
 6731                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6732
 6733        if list_of_pzfields:
 6734
 6735            # Explode Infos prefix
 6736            explode_infos_prefix = self.get_explode_infos_prefix()
 6737
 6738            # PZfields tags description
 6739            PZfields_INFOS = {
 6740                f"{pz_prefix}Tags": {
 6741                    "ID": f"{pz_prefix}Tags",
 6742                    "Number": ".",
 6743                    "Type": "String",
 6744                    "Description": "Variant tags based on annotation criteria",
 6745                },
 6746                f"{pz_prefix}Score": {
 6747                    "ID": f"{pz_prefix}Score",
 6748                    "Number": 1,
 6749                    "Type": "Integer",
 6750                    "Description": "Variant score based on annotation criteria",
 6751                },
 6752                f"{pz_prefix}Flag": {
 6753                    "ID": f"{pz_prefix}Flag",
 6754                    "Number": 1,
 6755                    "Type": "String",
 6756                    "Description": "Variant flag based on annotation criteria",
 6757                },
 6758                f"{pz_prefix}Comment": {
 6759                    "ID": f"{pz_prefix}Comment",
 6760                    "Number": ".",
 6761                    "Type": "String",
 6762                    "Description": "Variant comment based on annotation criteria",
 6763                },
 6764                f"{pz_prefix}Infos": {
 6765                    "ID": f"{pz_prefix}Infos",
 6766                    "Number": ".",
 6767                    "Type": "String",
 6768                    "Description": "Variant infos based on annotation criteria",
 6769                },
 6770            }
 6771
 6772            # Create INFO fields if not exist
 6773            for field in PZfields_INFOS:
 6774                field_ID = PZfields_INFOS[field]["ID"]
 6775                field_description = PZfields_INFOS[field]["Description"]
 6776                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6777                    field_description = (
 6778                        PZfields_INFOS[field]["Description"]
 6779                        + f", profile {default_profile}"
 6780                    )
 6781                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6782                        field_ID,
 6783                        PZfields_INFOS[field]["Number"],
 6784                        PZfields_INFOS[field]["Type"],
 6785                        field_description,
 6786                        "unknown",
 6787                        "unknown",
 6788                        code_type_map[PZfields_INFOS[field]["Type"]],
 6789                    )
 6790
 6791            # Create INFO fields if not exist for each profile
 6792            for profile in prioritizations_config:
 6793                if profile in profiles or profiles == []:
 6794                    for field in PZfields_INFOS:
 6795                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6796                        field_description = (
 6797                            PZfields_INFOS[field]["Description"]
 6798                            + f", profile {profile}"
 6799                        )
 6800                        if (
 6801                            field_ID not in self.get_header().infos
 6802                            and field in pzfields
 6803                        ):
 6804                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6805                                field_ID,
 6806                                PZfields_INFOS[field]["Number"],
 6807                                PZfields_INFOS[field]["Type"],
 6808                                field_description,
 6809                                "unknown",
 6810                                "unknown",
 6811                                code_type_map[PZfields_INFOS[field]["Type"]],
 6812                            )
 6813
 6814            # Header
 6815            for pzfield in list_of_pzfields:
 6816                if re.match(f"{pz_prefix}Score.*", pzfield):
 6817                    added_column = self.add_column(
 6818                        table_name=table_variants,
 6819                        column_name=pzfield,
 6820                        column_type="INTEGER",
 6821                        default_value="0",
 6822                    )
 6823                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6824                    added_column = self.add_column(
 6825                        table_name=table_variants,
 6826                        column_name=pzfield,
 6827                        column_type="BOOLEAN",
 6828                        default_value="1",
 6829                    )
 6830                else:
 6831                    added_column = self.add_column(
 6832                        table_name=table_variants,
 6833                        column_name=pzfield,
 6834                        column_type="STRING",
 6835                        default_value="''",
 6836                    )
 6837                added_columns.append(added_column)
 6838
 6839            # Profiles
 6840            if profiles:
 6841
 6842                # foreach profile in configuration file
 6843                for profile in prioritizations_config:
 6844
 6845                    # If profile is asked in param, or ALL are asked (empty profile [])
 6846                    if profile in profiles or profiles == []:
 6847                        log.info(f"Profile '{profile}'")
 6848
 6849                        sql_set_info_option = ""
 6850
 6851                        sql_set_info = []
 6852
 6853                        # PZ fields set
 6854
 6855                        # PZScore
 6856                        if (
 6857                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6858                            in list_of_pzfields
 6859                        ):
 6860                            sql_set_info.append(
 6861                                f"""
 6862                                    concat(
 6863                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6864                                        {pz_prefix}Score{pzfields_sep}{profile}
 6865                                    ) 
 6866                                """
 6867                            )
 6868                            if (
 6869                                profile == default_profile
 6870                                and f"{pz_prefix}Score" in list_of_pzfields
 6871                            ):
 6872                                sql_set_info.append(
 6873                                    f"""
 6874                                        concat(
 6875                                            '{pz_prefix}Score=',
 6876                                            {pz_prefix}Score{pzfields_sep}{profile}
 6877                                        )
 6878                                    """
 6879                                )
 6880
 6881                        # PZFlag
 6882                        if (
 6883                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6884                            in list_of_pzfields
 6885                        ):
 6886                            sql_set_info.append(
 6887                                f"""
 6888                                    concat(
 6889                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 6890                                        CASE 
 6891                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6892                                            THEN 'PASS'
 6893                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6894                                            THEN 'FILTERED'
 6895                                        END
 6896                                    ) 
 6897                                """
 6898                            )
 6899                            if (
 6900                                profile == default_profile
 6901                                and f"{pz_prefix}Flag" in list_of_pzfields
 6902                            ):
 6903                                sql_set_info.append(
 6904                                    f"""
 6905                                        concat(
 6906                                            '{pz_prefix}Flag=',
 6907                                            CASE 
 6908                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6909                                                THEN 'PASS'
 6910                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 6911                                                THEN 'FILTERED'
 6912                                            END
 6913                                        )
 6914                                    """
 6915                                )
 6916
 6917                        # PZComment
 6918                        if (
 6919                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 6920                            in list_of_pzfields
 6921                        ):
 6922                            sql_set_info.append(
 6923                                f"""
 6924                                    CASE
 6925                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6926                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 6927                                        ELSE ''
 6928                                    END
 6929                                """
 6930                            )
 6931                            if (
 6932                                profile == default_profile
 6933                                and f"{pz_prefix}Comment" in list_of_pzfields
 6934                            ):
 6935                                sql_set_info.append(
 6936                                    f"""
 6937                                        CASE
 6938                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 6939                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 6940                                            ELSE ''
 6941                                        END
 6942                                    """
 6943                                )
 6944
 6945                        # PZInfos
 6946                        if (
 6947                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 6948                            in list_of_pzfields
 6949                        ):
 6950                            sql_set_info.append(
 6951                                f"""
 6952                                    CASE
 6953                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6954                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 6955                                        ELSE ''
 6956                                    END
 6957                                """
 6958                            )
 6959                            if (
 6960                                profile == default_profile
 6961                                and f"{pz_prefix}Infos" in list_of_pzfields
 6962                            ):
 6963                                sql_set_info.append(
 6964                                    f"""
 6965                                        CASE
 6966                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 6967                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 6968                                            ELSE ''
 6969                                        END
 6970                                    """
 6971                                )
 6972
 6973                        # Merge PZfields
 6974                        sql_set_info_option = ""
 6975                        sql_set_sep = ""
 6976                        for sql_set in sql_set_info:
 6977                            if sql_set_sep:
 6978                                sql_set_info_option += f"""
 6979                                    , concat('{sql_set_sep}', {sql_set})
 6980                                """
 6981                            else:
 6982                                sql_set_info_option += f"""
 6983                                    , {sql_set}
 6984                                """
 6985                            sql_set_sep = ";"
 6986
 6987                        sql_queries = []
 6988                        for annotation in prioritizations_config[profile]:
 6989
 6990                            # Explode specific annotation
 6991                            log.debug(f"Explode annotation '{annotation}'")
 6992                            added_columns += self.explode_infos(
 6993                                prefix=explode_infos_prefix,
 6994                                fields=[annotation],
 6995                                table=table_variants,
 6996                            )
 6997                            extra_infos = self.get_extra_infos(table=table_variants)
 6998
 6999                            # Check if annotation field is present
 7000                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
 7001                                log.debug(f"Annotation '{annotation}' not in data")
 7002                                continue
 7003                            else:
 7004                                log.debug(f"Annotation '{annotation}' in data")
 7005
 7006                            # For each criterions
 7007                            for criterion in prioritizations_config[profile][
 7008                                annotation
 7009                            ]:
 7010                                criterion_type = criterion["type"]
 7011                                criterion_value = criterion["value"]
 7012                                criterion_score = criterion.get("score", 0)
 7013                                criterion_flag = criterion.get("flag", "PASS")
 7014                                criterion_flag_bool = criterion_flag == "PASS"
 7015                                criterion_comment = (
 7016                                    ", ".join(criterion.get("comment", []))
 7017                                    .replace("'", "''")
 7018                                    .replace(";", ",")
 7019                                    .replace("\t", " ")
 7020                                )
 7021                                criterion_infos = (
 7022                                    str(criterion)
 7023                                    .replace("'", "''")
 7024                                    .replace(";", ",")
 7025                                    .replace("\t", " ")
 7026                                )
 7027
 7028                                sql_set = []
 7029                                sql_set_info = []
 7030
 7031                                # PZ fields set
 7032                                if (
 7033                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7034                                    in list_of_pzfields
 7035                                ):
 7036                                    if prioritization_score_mode == "HOWARD":
 7037                                        sql_set.append(
 7038                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7039                                        )
 7040                                    elif prioritization_score_mode == "VaRank":
 7041                                        sql_set.append(
 7042                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7043                                        )
 7044                                    else:
 7045                                        sql_set.append(
 7046                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7047                                        )
 7048                                if (
 7049                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7050                                    in list_of_pzfields
 7051                                ):
 7052                                    sql_set.append(
 7053                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7054                                    )
 7055                                if (
 7056                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7057                                    in list_of_pzfields
 7058                                ):
 7059                                    sql_set.append(
 7060                                        f"""
 7061                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7062                                                concat(
 7063                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7064                                                    CASE 
 7065                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7066                                                        THEN ', '
 7067                                                        ELSE ''
 7068                                                    END,
 7069                                                    '{criterion_comment}'
 7070                                                )
 7071                                        """
 7072                                    )
 7073                                if (
 7074                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7075                                    in list_of_pzfields
 7076                                ):
 7077                                    sql_set.append(
 7078                                        f"""
 7079                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7080                                                concat(
 7081                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7082                                                    '{criterion_infos}'
 7083                                                )
 7084                                        """
 7085                                    )
 7086                                sql_set_option = ",".join(sql_set)
 7087
 7088                                # Criterion and comparison
 7089                                if sql_set_option:
 7090                                    try:
 7091                                        float(criterion_value)
 7092                                        sql_update = f"""
 7093                                            UPDATE {table_variants}
 7094                                            SET {sql_set_option}
 7095                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7096                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7097                                            """
 7098                                    except:
 7099                                        contains_option = ""
 7100                                        if criterion_type == "contains":
 7101                                            contains_option = ".*"
 7102                                        sql_update = f"""
 7103                                            UPDATE {table_variants}
 7104                                            SET {sql_set_option}
 7105                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7106                                            """
 7107                                    sql_queries.append(sql_update)
 7108                                else:
 7109                                    log.warning(
 7110                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7111                                    )
 7112
 7113                        # PZTags
 7114                        if (
 7115                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7116                            in list_of_pzfields
 7117                        ):
 7118
 7119                            # Create PZFalgs value
 7120                            pztags_value = ""
 7121                            pztags_sep_default = "|"
 7122                            pztags_sep = ""
 7123                            for pzfield in pzfields:
 7124                                if pzfield not in [f"{pz_prefix}Tags"]:
 7125                                    if (
 7126                                        f"{pzfield}{pzfields_sep}{profile}"
 7127                                        in list_of_pzfields
 7128                                    ):
 7129                                        if pzfield in [f"{pz_prefix}Flag"]:
 7130                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7131                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7132                                                    THEN 'PASS'
 7133                                                    ELSE 'FILTERED'
 7134                                                END, '"""
 7135                                        else:
 7136                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7137                                        pztags_sep = pztags_sep_default
 7138
 7139                            # Add Query update for PZFlags
 7140                            sql_update_pztags = f"""
 7141                                UPDATE {table_variants}
 7142                                SET INFO = concat(
 7143                                        INFO,
 7144                                        CASE WHEN INFO NOT in ('','.')
 7145                                                THEN ';'
 7146                                                ELSE ''
 7147                                        END,
 7148                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7149                                    )
 7150                                """
 7151                            sql_queries.append(sql_update_pztags)
 7152
 7153                            # Add Query update for PZFlags for default
 7154                            if profile == default_profile:
 7155                                sql_update_pztags_default = f"""
 7156                                UPDATE {table_variants}
 7157                                SET INFO = concat(
 7158                                        INFO,
 7159                                        ';',
 7160                                        '{pz_prefix}Tags={pztags_value}'
 7161                                    )
 7162                                """
 7163                                sql_queries.append(sql_update_pztags_default)
 7164
 7165                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7166
 7167                        if sql_queries:
 7168
 7169                            for sql_query in sql_queries:
 7170                                log.debug(
 7171                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7172                                )
 7173                                self.conn.execute(sql_query)
 7174
 7175                        log.info(f"""Profile '{profile}' - Update... """)
 7176                        sql_query_update = f"""
 7177                            UPDATE {table_variants}
 7178                            SET INFO =  
 7179                                concat(
 7180                                    CASE
 7181                                        WHEN INFO NOT IN ('','.')
 7182                                        THEN concat(INFO, ';')
 7183                                        ELSE ''
 7184                                    END
 7185                                    {sql_set_info_option}
 7186                                )
 7187                        """
 7188                        self.conn.execute(sql_query_update)
 7189
 7190        else:
 7191
 7192            log.warning(f"No profiles in parameters")
 7193
 7194        # Remove added columns
 7195        for added_column in added_columns:
 7196            self.drop_column(column=added_column)
 7197
 7198        # Explode INFOS fields into table fields
 7199        if self.get_explode_infos():
 7200            self.explode_infos(
 7201                prefix=self.get_explode_infos_prefix(),
 7202                fields=self.get_explode_infos_fields(),
 7203                force=True,
 7204            )
 7205
 7206        return True
 7207
 7208    ###
 7209    # HGVS
 7210    ###
 7211
 7212    def annotation_hgvs(self, threads: int = None) -> None:
 7213        """
 7214        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7215        coordinates and alleles.
 7216
 7217        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7218        threads to use for parallel processing. If no value is provided, it will default to the number
 7219        of threads obtained from the `get_threads()` method
 7220        :type threads: int
 7221        """
 7222
 7223        # Function for each partition of the Dask Dataframe
 7224        def partition_function(partition):
 7225            """
 7226            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7227            each row of a DataFrame called `partition`.
 7228
 7229            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7230            to be processed
 7231            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7232            the "partition" dataframe along the axis 1.
 7233            """
 7234            return partition.apply(annotation_hgvs_partition, axis=1)
 7235
 7236        def annotation_hgvs_partition(row) -> str:
 7237            """
 7238            The function `annotation_hgvs_partition` takes in a row of data and returns a string
 7239            containing a list of HGVS names associated with the given genomic coordinates and alleles.
 7240
 7241            :param row: A dictionary-like object that contains the values for the following keys:
 7242            :return: a string that contains the HGVS names associated with the given row of data.
 7243            """
 7244
 7245            chr = row["CHROM"]
 7246            pos = row["POS"]
 7247            ref = row["REF"]
 7248            alt = row["ALT"]
 7249
 7250            # Find list of associated transcripts
 7251            transcripts_list = list(
 7252                polars_conn.execute(
 7253                    f"""
 7254                SELECT transcript
 7255                FROM refseq_df
 7256                WHERE CHROM='{chr}'
 7257                AND POS={pos}
 7258            """
 7259                )["transcript"]
 7260            )
 7261
 7262            # Full HGVS annotation in list
 7263            hgvs_full_list = []
 7264
 7265            for transcript_name in transcripts_list:
 7266
 7267                # Transcript
 7268                transcript = get_transcript(
 7269                    transcripts=transcripts, transcript_name=transcript_name
 7270                )
 7271                # Exon
 7272                if use_exon:
 7273                    exon = transcript.find_exon_number(pos)
 7274                else:
 7275                    exon = None
 7276                # Protein
 7277                transcript_protein = None
 7278                if use_protein or add_protein or full_format:
 7279                    transcripts_protein = list(
 7280                        polars_conn.execute(
 7281                            f"""
 7282                        SELECT protein
 7283                        FROM refseqlink_df
 7284                        WHERE transcript='{transcript_name}'
 7285                        LIMIT 1
 7286                    """
 7287                        )["protein"]
 7288                    )
 7289                    if len(transcripts_protein):
 7290                        transcript_protein = transcripts_protein[0]
 7291
 7292                # HGVS name
 7293                hgvs_name = format_hgvs_name(
 7294                    chr,
 7295                    pos,
 7296                    ref,
 7297                    alt,
 7298                    genome=genome,
 7299                    transcript=transcript,
 7300                    transcript_protein=transcript_protein,
 7301                    exon=exon,
 7302                    use_gene=use_gene,
 7303                    use_protein=use_protein,
 7304                    full_format=full_format,
 7305                    use_version=use_version,
 7306                    codon_type=codon_type,
 7307                )
 7308                hgvs_full_list.append(hgvs_name)
 7309                if add_protein and not use_protein and not full_format:
 7310                    hgvs_name = format_hgvs_name(
 7311                        chr,
 7312                        pos,
 7313                        ref,
 7314                        alt,
 7315                        genome=genome,
 7316                        transcript=transcript,
 7317                        transcript_protein=transcript_protein,
 7318                        exon=exon,
 7319                        use_gene=use_gene,
 7320                        use_protein=True,
 7321                        full_format=False,
 7322                        use_version=use_version,
 7323                        codon_type=codon_type,
 7324                    )
 7325                    hgvs_full_list.append(hgvs_name)
 7326
 7327            # Create liste of HGVS annotations
 7328            hgvs_full = ",".join(hgvs_full_list)
 7329
 7330            return hgvs_full
 7331
 7332        # Polars connexion
 7333        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7334
 7335        # Config
 7336        config = self.get_config()
 7337
 7338        # Databases
 7339        # Genome
 7340        databases_genomes_folders = (
 7341            config.get("folders", {})
 7342            .get("databases", {})
 7343            .get("genomes", DEFAULT_GENOME_FOLDER)
 7344        )
 7345        databases_genome = (
 7346            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7347        )
 7348        # refseq database folder
 7349        databases_refseq_folders = (
 7350            config.get("folders", {})
 7351            .get("databases", {})
 7352            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7353        )
 7354        # refseq
 7355        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7356        # refSeqLink
 7357        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7358
 7359        # Param
 7360        param = self.get_param()
 7361
 7362        # Quick HGVS
 7363        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7364            log.info(f"Quick HGVS Annotation:")
 7365            if not param.get("hgvs", None):
 7366                param["hgvs"] = {}
 7367            for option in param.get("hgvs_options", "").split(","):
 7368                option_var_val = option.split("=")
 7369                option_var = option_var_val[0]
 7370                if len(option_var_val) > 1:
 7371                    option_val = option_var_val[1]
 7372                else:
 7373                    option_val = "True"
 7374                if option_val.upper() in ["TRUE"]:
 7375                    option_val = True
 7376                elif option_val.upper() in ["FALSE"]:
 7377                    option_val = False
 7378                log.info(f"   {option_var}={option_val}")
 7379                param["hgvs"][option_var] = option_val
 7380
 7381        # Check if HGVS annotation enabled
 7382        if "hgvs" in param:
 7383            log.info(f"HGVS Annotation... ")
 7384            for hgvs_option in param.get("hgvs", {}):
 7385                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7386        else:
 7387            return
 7388
 7389        # HGVS Param
 7390        param_hgvs = param.get("hgvs", {})
 7391        use_exon = param_hgvs.get("use_exon", False)
 7392        use_gene = param_hgvs.get("use_gene", False)
 7393        use_protein = param_hgvs.get("use_protein", False)
 7394        add_protein = param_hgvs.get("add_protein", False)
 7395        full_format = param_hgvs.get("full_format", False)
 7396        use_version = param_hgvs.get("use_version", False)
 7397        codon_type = param_hgvs.get("codon_type", "3")
 7398
 7399        # refSseq refSeqLink
 7400        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7401        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7402
 7403        # Assembly
 7404        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7405
 7406        # Genome
 7407        genome_file = None
 7408        if find_genome(databases_genome):
 7409            genome_file = find_genome(databases_genome)
 7410        else:
 7411            genome_file = find_genome(
 7412                genome_path=databases_genomes_folders, assembly=assembly
 7413            )
 7414        log.debug("Genome: " + str(genome_file))
 7415
 7416        # refSseq
 7417        refseq_file = find_file_prefix(
 7418            input_file=databases_refseq,
 7419            prefix="ncbiRefSeq",
 7420            folder=databases_refseq_folders,
 7421            assembly=assembly,
 7422        )
 7423        log.debug("refSeq: " + str(refseq_file))
 7424
 7425        # refSeqLink
 7426        refseqlink_file = find_file_prefix(
 7427            input_file=databases_refseqlink,
 7428            prefix="ncbiRefSeqLink",
 7429            folder=databases_refseq_folders,
 7430            assembly=assembly,
 7431        )
 7432        log.debug("refSeqLink: " + str(refseqlink_file))
 7433
 7434        # Threads
 7435        if not threads:
 7436            threads = self.get_threads()
 7437        log.debug("Threads: " + str(threads))
 7438
 7439        # Variables
 7440        table_variants = self.get_table_variants(clause="update")
 7441
 7442        # Get variants SNV and InDel only
 7443        query_variants = f"""
 7444            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7445            FROM {table_variants}
 7446            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7447            """
 7448        df_variants = self.get_query_to_df(query_variants)
 7449
 7450        # Added columns
 7451        added_columns = []
 7452
 7453        # Add hgvs column in variants table
 7454        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7455        added_column = self.add_column(
 7456            table_variants, hgvs_column_name, "STRING", default_value=None
 7457        )
 7458        added_columns.append(added_column)
 7459
 7460        log.debug(f"refSeq loading...")
 7461        # refSeq in duckDB
 7462        refseq_table = get_refseq_table(
 7463            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7464        )
 7465        # Loading all refSeq in Dataframe
 7466        refseq_query = f"""
 7467            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 7468            FROM {refseq_table}
 7469            JOIN df_variants ON (
 7470                {refseq_table}.chrom = df_variants.CHROM
 7471                AND {refseq_table}.txStart<=df_variants.POS
 7472                AND {refseq_table}.txEnd>=df_variants.POS
 7473            )
 7474        """
 7475        refseq_df = self.conn.query(refseq_query).pl()
 7476
 7477        if refseqlink_file:
 7478            log.debug(f"refSeqLink loading...")
 7479            # refSeqLink in duckDB
 7480            refseqlink_table = get_refseq_table(
 7481                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 7482            )
 7483            # Loading all refSeqLink in Dataframe
 7484            protacc_column = "protAcc_with_ver"
 7485            mrnaacc_column = "mrnaAcc_with_ver"
 7486            refseqlink_query = f"""
 7487                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 7488                FROM {refseqlink_table} 
 7489                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 7490                WHERE protAcc_without_ver IS NOT NULL
 7491            """
 7492            # Polars Dataframe
 7493            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 7494
 7495        # Read RefSeq transcripts into a python dict/model.
 7496        log.debug(f"Transcripts loading...")
 7497        with tempfile.TemporaryDirectory() as tmpdir:
 7498            transcripts_query = f"""
 7499                COPY (
 7500                    SELECT {refseq_table}.*
 7501                    FROM {refseq_table}
 7502                    JOIN df_variants ON (
 7503                        {refseq_table}.chrom=df_variants.CHROM
 7504                        AND {refseq_table}.txStart<=df_variants.POS
 7505                        AND {refseq_table}.txEnd>=df_variants.POS
 7506                    )
 7507                )
 7508                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 7509            """
 7510            self.conn.query(transcripts_query)
 7511            with open(f"{tmpdir}/transcript.tsv") as infile:
 7512                transcripts = read_transcripts(infile)
 7513
 7514        # Polars connexion
 7515        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7516
 7517        log.debug("Genome loading...")
 7518        # Read genome sequence using pyfaidx.
 7519        genome = Fasta(genome_file)
 7520
 7521        log.debug("Start annotation HGVS...")
 7522
 7523        # Create
 7524        # a Dask Dataframe from Pandas dataframe with partition as number of threads
 7525        ddf = dd.from_pandas(df_variants, npartitions=threads)
 7526
 7527        # Use dask.dataframe.apply() to apply function on each partition
 7528        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 7529
 7530        # Convert Dask DataFrame to Pandas Dataframe
 7531        df = ddf.compute()
 7532
 7533        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 7534        with tempfile.TemporaryDirectory() as tmpdir:
 7535            df_parquet = os.path.join(tmpdir, "df.parquet")
 7536            df.to_parquet(df_parquet)
 7537
 7538            # Update hgvs column
 7539            update_variant_query = f"""
 7540                UPDATE {table_variants}
 7541                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 7542                FROM read_parquet('{df_parquet}') as df
 7543                WHERE variants."#CHROM" = df.CHROM
 7544                AND variants.POS = df.POS
 7545                AND variants.REF = df.REF
 7546                AND variants.ALT = df.ALT
 7547                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 7548                """
 7549            self.execute_query(update_variant_query)
 7550
 7551        # Update INFO column
 7552        sql_query_update = f"""
 7553            UPDATE {table_variants}
 7554            SET INFO = 
 7555                concat(
 7556                    CASE 
 7557                        WHEN INFO NOT IN ('','.')
 7558                        THEN concat(INFO, ';')
 7559                        ELSE ''
 7560                    END,
 7561                    'hgvs=',
 7562                    {hgvs_column_name}
 7563                )
 7564            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 7565            """
 7566        self.execute_query(sql_query_update)
 7567
 7568        # Add header
 7569        HGVS_INFOS = {
 7570            "hgvs": {
 7571                "ID": "hgvs",
 7572                "Number": ".",
 7573                "Type": "String",
 7574                "Description": f"HGVS annotatation with HOWARD",
 7575            }
 7576        }
 7577
 7578        for field in HGVS_INFOS:
 7579            field_ID = HGVS_INFOS[field]["ID"]
 7580            field_description = HGVS_INFOS[field]["Description"]
 7581            self.get_header().infos[field_ID] = vcf.parser._Info(
 7582                field_ID,
 7583                HGVS_INFOS[field]["Number"],
 7584                HGVS_INFOS[field]["Type"],
 7585                field_description,
 7586                "unknown",
 7587                "unknown",
 7588                code_type_map[HGVS_INFOS[field]["Type"]],
 7589            )
 7590
 7591        # Remove added columns
 7592        for added_column in added_columns:
 7593            self.drop_column(column=added_column)
 7594
 7595    ###
 7596    # Calculation
 7597    ###
 7598
 7599    def get_operations_help(
 7600        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7601    ) -> list:
 7602
 7603        # Init
 7604        operations_help = []
 7605
 7606        # operations
 7607        operations = self.get_config_json(
 7608            name="calculations",
 7609            config_dict=operations_config_dict,
 7610            config_file=operations_config_file,
 7611        )
 7612        for op in operations:
 7613            op_name = operations[op].get("name", op).upper()
 7614            op_description = operations[op].get("description", op_name)
 7615            op_available = operations[op].get("available", False)
 7616            if op_available:
 7617                operations_help.append(f"   {op_name}: {op_description}")
 7618
 7619        # Sort operations
 7620        operations_help.sort()
 7621
 7622        # insert header
 7623        operations_help.insert(0, "Available calculation operations:")
 7624
 7625        # Return
 7626        return operations_help
 7627
 7628    def calculation(
 7629        self,
 7630        operations: dict = {},
 7631        operations_config_dict: dict = {},
 7632        operations_config_file: str = None,
 7633    ) -> None:
 7634        """
 7635        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7636        operation, and then calls the appropriate function
 7637
 7638        param json example:
 7639            "calculation": {
 7640                "NOMEN": {
 7641                    "options": {
 7642                        "hgvs_field": "hgvs"
 7643                    },
 7644                "middle" : null
 7645            }
 7646        """
 7647
 7648        # Param
 7649        param = self.get_param()
 7650
 7651        # operations config
 7652        operations_config = self.get_config_json(
 7653            name="calculations",
 7654            config_dict=operations_config_dict,
 7655            config_file=operations_config_file,
 7656        )
 7657
 7658        # Upper keys
 7659        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7660
 7661        # Calculations
 7662
 7663        # Operations from param
 7664        operations = param.get("calculation", {}).get("calculations", operations)
 7665
 7666        # Quick calculation - add
 7667        if param.get("calculations", None):
 7668            calculations_list = [
 7669                value for value in param.get("calculations", "").split(",")
 7670            ]
 7671            log.info(f"Quick Calculations:")
 7672            for calculation_key in calculations_list:
 7673                log.info(f"   {calculation_key}")
 7674            for calculation_operation in calculations_list:
 7675                if calculation_operation.upper() not in operations:
 7676                    operations[calculation_operation.upper()] = {}
 7677                    add_value_into_dict(
 7678                        dict_tree=param,
 7679                        sections=[
 7680                            "calculation",
 7681                            "calculations",
 7682                            calculation_operation.upper(),
 7683                        ],
 7684                        value={},
 7685                    )
 7686
 7687        # Operations for calculation
 7688        if not operations:
 7689            operations = param.get("calculation", {}).get("calculations", {})
 7690
 7691        if operations:
 7692            log.info(f"Calculations...")
 7693
 7694        # For each operations
 7695        for operation_name in operations:
 7696            operation_name = operation_name.upper()
 7697            if operation_name not in [""]:
 7698                if operation_name in operations_config:
 7699                    log.info(f"Calculation '{operation_name}'")
 7700                    operation = operations_config[operation_name]
 7701                    operation_type = operation.get("type", "sql")
 7702                    if operation_type == "python":
 7703                        self.calculation_process_function(
 7704                            operation=operation, operation_name=operation_name
 7705                        )
 7706                    elif operation_type == "sql":
 7707                        self.calculation_process_sql(
 7708                            operation=operation, operation_name=operation_name
 7709                        )
 7710                    else:
 7711                        log.error(
 7712                            f"Operations config: Type '{operation_type}' NOT available"
 7713                        )
 7714                        raise ValueError(
 7715                            f"Operations config: Type '{operation_type}' NOT available"
 7716                        )
 7717                else:
 7718                    log.error(
 7719                        f"Operations config: Calculation '{operation_name}' NOT available"
 7720                    )
 7721                    raise ValueError(
 7722                        f"Operations config: Calculation '{operation_name}' NOT available"
 7723                    )
 7724
 7725        # Explode INFOS fields into table fields
 7726        if self.get_explode_infos():
 7727            self.explode_infos(
 7728                prefix=self.get_explode_infos_prefix(),
 7729                fields=self.get_explode_infos_fields(),
 7730                force=True,
 7731            )
 7732
 7733    def calculation_process_sql(
 7734        self, operation: dict, operation_name: str = "unknown"
 7735    ) -> None:
 7736        """
 7737        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7738        performs the operation, updating the specified table with the result.
 7739
 7740        :param operation: The `operation` parameter is a dictionary that contains information about the
 7741        mathematical operation to be performed. It includes the following keys:
 7742        :type operation: dict
 7743        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7744        the mathematical operation being performed. It is used for logging and error handling purposes,
 7745        defaults to unknown
 7746        :type operation_name: str (optional)
 7747        """
 7748
 7749        # table variants
 7750        table_variants = self.get_table_variants(clause="alter")
 7751
 7752        # Operation infos
 7753        operation_name = operation.get("name", "unknown")
 7754        log.debug(f"process sql {operation_name}")
 7755        output_column_name = operation.get("output_column_name", operation_name)
 7756        output_column_type = operation.get("output_column_type", "String")
 7757        prefix = operation.get("explode_infos_prefix", "")
 7758        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7759        output_column_description = operation.get(
 7760            "output_column_description", f"{operation_name} operation"
 7761        )
 7762        operation_query = operation.get("operation_query", None)
 7763        if isinstance(operation_query, list):
 7764            operation_query = " ".join(operation_query)
 7765        operation_info_fields = operation.get("info_fields", [])
 7766        operation_info_fields_check = operation.get("info_fields_check", False)
 7767        operation_info = operation.get("operation_info", True)
 7768
 7769        if operation_query:
 7770
 7771            # Info fields check
 7772            operation_info_fields_check_result = True
 7773            if operation_info_fields_check:
 7774                header_infos = self.get_header().infos
 7775                for info_field in operation_info_fields:
 7776                    operation_info_fields_check_result = (
 7777                        operation_info_fields_check_result
 7778                        and info_field in header_infos
 7779                    )
 7780
 7781            # If info fields available
 7782            if operation_info_fields_check_result:
 7783
 7784                # Added_columns
 7785                added_columns = []
 7786
 7787                # Create VCF header field
 7788                vcf_reader = self.get_header()
 7789                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 7790                    output_column_name,
 7791                    ".",
 7792                    output_column_type,
 7793                    output_column_description,
 7794                    "howard calculation",
 7795                    "0",
 7796                    self.code_type_map.get(output_column_type),
 7797                )
 7798
 7799                # Explode infos if needed
 7800                log.debug(f"calculation_process_sql prefix {prefix}")
 7801                added_columns += self.explode_infos(
 7802                    prefix=prefix,
 7803                    fields=[output_column_name] + operation_info_fields,
 7804                    force=True,
 7805                )
 7806
 7807                # Create column
 7808                added_column = self.add_column(
 7809                    table_name=table_variants,
 7810                    column_name=prefix + output_column_name,
 7811                    column_type=output_column_type_sql,
 7812                    default_value="null",
 7813                )
 7814                added_columns.append(added_column)
 7815
 7816                # Operation calculation
 7817                try:
 7818
 7819                    # Query to update calculation column
 7820                    sql_update = f"""
 7821                        UPDATE {table_variants}
 7822                        SET "{prefix}{output_column_name}" = ({operation_query})
 7823                    """
 7824                    self.conn.execute(sql_update)
 7825
 7826                    # Add to INFO
 7827                    if operation_info:
 7828                        sql_update_info = f"""
 7829                            UPDATE {table_variants}
 7830                            SET "INFO" =
 7831                                concat(
 7832                                    CASE
 7833                                        WHEN "INFO" IS NOT NULL
 7834                                        THEN concat("INFO", ';')
 7835                                        ELSE ''
 7836                                    END,
 7837                                    '{output_column_name}=',
 7838                                    "{prefix}{output_column_name}"
 7839                                )
 7840                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 7841                        """
 7842                        self.conn.execute(sql_update_info)
 7843
 7844                except:
 7845                    log.error(
 7846                        f"Operations config: Calculation '{operation_name}' query failed"
 7847                    )
 7848                    raise ValueError(
 7849                        f"Operations config: Calculation '{operation_name}' query failed"
 7850                    )
 7851
 7852                # Remove added columns
 7853                for added_column in added_columns:
 7854                    log.debug(f"added_column: {added_column}")
 7855                    self.drop_column(column=added_column)
 7856
 7857            else:
 7858                log.error(
 7859                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7860                )
 7861                raise ValueError(
 7862                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 7863                )
 7864
 7865        else:
 7866            log.error(
 7867                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7868            )
 7869            raise ValueError(
 7870                f"Operations config: Calculation '{operation_name}' query NOT defined"
 7871            )
 7872
 7873    def calculation_process_function(
 7874        self, operation: dict, operation_name: str = "unknown"
 7875    ) -> None:
 7876        """
 7877        The `calculation_process_function` takes in an operation dictionary and performs the specified
 7878        function with the given parameters.
 7879
 7880        :param operation: The `operation` parameter is a dictionary that contains information about the
 7881        operation to be performed. It has the following keys:
 7882        :type operation: dict
 7883        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7884        the operation being performed. It is used for logging purposes, defaults to unknown
 7885        :type operation_name: str (optional)
 7886        """
 7887
 7888        operation_name = operation["name"]
 7889        log.debug(f"process sql {operation_name}")
 7890        function_name = operation["function_name"]
 7891        function_params = operation["function_params"]
 7892        getattr(self, function_name)(*function_params)
 7893
 7894    def calculation_variant_id(self) -> None:
 7895        """
 7896        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 7897        updates the INFO field of a variants table with the variant ID.
 7898        """
 7899
 7900        # variant_id annotation field
 7901        variant_id_tag = self.get_variant_id_column()
 7902        added_columns = [variant_id_tag]
 7903
 7904        # variant_id hgvs tags"
 7905        vcf_infos_tags = {
 7906            variant_id_tag: "howard variant ID annotation",
 7907        }
 7908
 7909        # Variants table
 7910        table_variants = self.get_table_variants()
 7911
 7912        # Header
 7913        vcf_reader = self.get_header()
 7914
 7915        # Add variant_id to header
 7916        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 7917            variant_id_tag,
 7918            ".",
 7919            "String",
 7920            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 7921            "howard calculation",
 7922            "0",
 7923            self.code_type_map.get("String"),
 7924        )
 7925
 7926        # Update
 7927        sql_update = f"""
 7928            UPDATE {table_variants}
 7929            SET "INFO" = 
 7930                concat(
 7931                    CASE
 7932                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 7933                        THEN ''
 7934                        ELSE concat("INFO", ';')
 7935                    END,
 7936                    '{variant_id_tag}=',
 7937                    "{variant_id_tag}"
 7938                )
 7939        """
 7940        self.conn.execute(sql_update)
 7941
 7942        # Remove added columns
 7943        for added_column in added_columns:
 7944            self.drop_column(column=added_column)
 7945
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new INFO tag in the variants table.

        If the SnpEff field is absent from the header, a warning is logged and
        nothing is changed. Temporary columns created along the way are dropped
        at the end in every case.

        :param snpeff_hgvs: name of the INFO tag (and temporary column) that will
            store the HGVS nomenclatures extracted from the SnpEff annotation
            field, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field in the VCF file that contains
            the SnpEff annotations to parse, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the SnpEff header description does not contain the
            expected quoted, pipe-separated list of annotation sub-field names
        """

        # Snpeff hgvs tags (description used when registering the header tag)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any truthy explode prefix is replaced by the literal
        # "INFO/" rather than used as-is — confirm this is intended.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names once INFO fields are exploded)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added by this method, dropped again at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the annotation sub-field names from the quoted,
            # pipe-separated list embedded in the ANN header description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters to build a normalized key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (temporary column used as the join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of variant ids and raw SnpEff annotation values
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO with the extracted HGVS values.
            # NOTE(review): the SQL references the Python local
            # `dataframe_snpeff_hgvs` by name — presumably relying on DuckDB's
            # replacement scan of in-scope dataframes; renaming the local would
            # break this query. TODO confirm.
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8082
 8083    def calculation_snpeff_ann_explode(
 8084        self,
 8085        uniquify: bool = True,
 8086        output_format: str = "fields",
 8087        output_prefix: str = "snpeff_",
 8088        snpeff_field: str = "ANN",
 8089    ) -> None:
 8090        """
 8091        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8092        exploding the HGVS field and updating variant information accordingly.
 8093
 8094        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8095        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8096        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8097        defaults to True
 8098        :type uniquify: bool (optional)
 8099        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8100        function specifies the format in which the output annotations will be generated. It has a
 8101        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8102        format, defaults to fields
 8103        :type output_format: str (optional)
 8104        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8105        method is used to specify the prefix that will be added to the output annotations generated
 8106        during the calculation process. This prefix helps to differentiate the newly added annotations
 8107        from existing ones in the output data. By default, the, defaults to ANN_
 8108        :type output_prefix: str (optional)
 8109        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8110        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8111        field will be processed to explode the HGVS annotations and update the variant information
 8112        accordingly, defaults to ANN
 8113        :type snpeff_field: str (optional)
 8114        """
 8115
 8116        # SnpEff annotation field
 8117        snpeff_hgvs = "snpeff_ann_explode"
 8118
 8119        # Snpeff hgvs tags
 8120        vcf_infos_tags = {
 8121            snpeff_hgvs: "Explode snpEff annotations",
 8122        }
 8123
 8124        # Prefix
 8125        prefix = self.get_explode_infos_prefix()
 8126        if prefix:
 8127            prefix = "INFO/"
 8128
 8129        # snpEff fields
 8130        speff_ann_infos = prefix + snpeff_field
 8131        speff_hgvs_infos = prefix + snpeff_hgvs
 8132
 8133        # Variants table
 8134        table_variants = self.get_table_variants()
 8135
 8136        # Header
 8137        vcf_reader = self.get_header()
 8138
 8139        # Add columns
 8140        added_columns = []
 8141
 8142        # Explode HGVS field in column
 8143        added_columns += self.explode_infos(fields=[snpeff_field])
 8144        log.debug(f"snpeff_field={snpeff_field}")
 8145        log.debug(f"added_columns={added_columns}")
 8146
 8147        if snpeff_field in vcf_reader.infos:
 8148
 8149            # Extract ANN header
 8150            ann_description = vcf_reader.infos[snpeff_field].desc
 8151            pattern = r"'(.+?)'"
 8152            match = re.search(pattern, ann_description)
 8153            if match:
 8154                ann_header_match = match.group(1).split(" | ")
 8155                ann_header = []
 8156                ann_header_desc = {}
 8157                for i in range(len(ann_header_match)):
 8158                    ann_header_info = "".join(
 8159                        char for char in ann_header_match[i] if char.isalnum()
 8160                    )
 8161                    ann_header.append(ann_header_info)
 8162                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8163                if not ann_header_desc:
 8164                    raise ValueError("Invalid header description format")
 8165            else:
 8166                raise ValueError("Invalid header description format")
 8167
 8168            # Create variant id
 8169            variant_id_column = self.get_variant_id_column()
 8170            added_columns += [variant_id_column]
 8171
 8172            # Create dataframe
 8173            dataframe_snpeff_hgvs = self.get_query_to_df(
 8174                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8175            )
 8176
 8177            # Create snpEff columns
 8178            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8179                speff_ann_infos
 8180            ].apply(
 8181                lambda x: explode_snpeff_ann(
 8182                    str(x),
 8183                    uniquify=uniquify,
 8184                    output_format=output_format,
 8185                    prefix=output_prefix,
 8186                    header=list(ann_header_desc.values()),
 8187                )
 8188            )
 8189
 8190            # Header
 8191            ann_annotations_prefix = ""
 8192            if output_format.upper() in ["JSON"]:
 8193                ann_annotations_prefix = f"{output_prefix}="
 8194                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8195                    output_prefix,
 8196                    ".",
 8197                    "String",
 8198                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8199                    + " - JSON format",
 8200                    "howard calculation",
 8201                    "0",
 8202                    self.code_type_map.get("String"),
 8203                )
 8204            else:
 8205                for ann_annotation in ann_header:
 8206                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8207                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8208                        ann_annotation_id,
 8209                        ".",
 8210                        "String",
 8211                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8212                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8213                        "howard calculation",
 8214                        "0",
 8215                        self.code_type_map.get("String"),
 8216                    )
 8217
 8218            # Update
 8219            sql_update = f"""
 8220                UPDATE variants
 8221                SET "INFO" = 
 8222                    concat(
 8223                        CASE
 8224                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8225                            THEN ''
 8226                            ELSE concat("INFO", ';')
 8227                        END,
 8228                        CASE 
 8229                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8230                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8231                            THEN concat(
 8232                                '{ann_annotations_prefix}',
 8233                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8234                                )
 8235                            ELSE ''
 8236                        END
 8237                    )
 8238                FROM dataframe_snpeff_hgvs
 8239                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8240
 8241            """
 8242            self.conn.execute(sql_update)
 8243
 8244            # Delete dataframe
 8245            del dataframe_snpeff_hgvs
 8246            gc.collect()
 8247
 8248        else:
 8249
 8250            log.warning(
 8251                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8252            )
 8253
 8254        # Remove added columns
 8255        for added_column in added_columns:
 8256            self.drop_column(column=added_column)
 8257
 8258    def calculation_extract_nomen(self) -> None:
 8259        """
 8260        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
 8261        """
 8262
 8263        # NOMEN field
 8264        field_nomen_dict = "NOMEN_DICT"
 8265
 8266        # NOMEN structure
 8267        nomen_dict = {
 8268            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
 8269            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
 8270            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
 8271            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
 8272            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
 8273            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
 8274            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
 8275            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
 8276            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
 8277            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
 8278        }
 8279
 8280        # Param
 8281        param = self.get_param()
 8282
 8283        # Prefix
 8284        prefix = self.get_explode_infos_prefix()
 8285
 8286        # Header
 8287        vcf_reader = self.get_header()
 8288
 8289        # Get HGVS field
 8290        hgvs_field = (
 8291            param.get("calculation", {})
 8292            .get("calculations", {})
 8293            .get("NOMEN", {})
 8294            .get("options", {})
 8295            .get("hgvs_field", "hgvs")
 8296        )
 8297
 8298        # Get transcripts
 8299        transcripts_file = (
 8300            param.get("calculation", {})
 8301            .get("calculations", {})
 8302            .get("NOMEN", {})
 8303            .get("options", {})
 8304            .get("transcripts", None)
 8305        )
 8306        transcripts_file = full_path(transcripts_file)
 8307        transcripts = []
 8308        if transcripts_file:
 8309            if os.path.exists(transcripts_file):
 8310                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
 8311                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
 8312            else:
 8313                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
 8314                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
 8315
 8316        # Added columns
 8317        added_columns = []
 8318
 8319        # Explode HGVS field in column
 8320        added_columns += self.explode_infos(fields=[hgvs_field])
 8321
 8322        # extra infos
 8323        extra_infos = self.get_extra_infos()
 8324        extra_field = prefix + hgvs_field
 8325
 8326        if extra_field in extra_infos:
 8327
 8328            # Create dataframe
 8329            dataframe_hgvs = self.get_query_to_df(
 8330                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
 8331            )
 8332
 8333            # Create main NOMEN column
 8334            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
 8335                lambda x: find_nomen(str(x), transcripts=transcripts)
 8336            )
 8337
 8338            # Explode NOMEN Structure and create SQL set for update
 8339            sql_nomen_fields = []
 8340            for nomen_field in nomen_dict:
 8341
 8342                # Explode each field into a column
 8343                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
 8344                    lambda x: dict(x).get(nomen_field, "")
 8345                )
 8346
 8347                # Create VCF header field
 8348                vcf_reader.infos[nomen_field] = vcf.parser._Info(
 8349                    nomen_field,
 8350                    ".",
 8351                    "String",
 8352                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
 8353                    "howard calculation",
 8354                    "0",
 8355                    self.code_type_map.get("String"),
 8356                )
 8357                sql_nomen_fields.append(
 8358                    f"""
 8359                        CASE 
 8360                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
 8361                            THEN concat(
 8362                                    ';{nomen_field}=',
 8363                                    dataframe_hgvs."{nomen_field}"
 8364                                )
 8365                            ELSE ''
 8366                        END
 8367                    """
 8368                )
 8369
 8370            # SQL set for update
 8371            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
 8372
 8373            # Update
 8374            sql_update = f"""
 8375                UPDATE variants
 8376                SET "INFO" = 
 8377                    concat(
 8378                        CASE
 8379                            WHEN "INFO" IS NULL
 8380                            THEN ''
 8381                            ELSE "INFO"
 8382                        END,
 8383                        {sql_nomen_fields_set}
 8384                    )
 8385                FROM dataframe_hgvs
 8386                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
 8387                    AND variants."POS" = dataframe_hgvs."POS" 
 8388                    AND variants."REF" = dataframe_hgvs."REF"
 8389                    AND variants."ALT" = dataframe_hgvs."ALT"
 8390            """
 8391            self.conn.execute(sql_update)
 8392
 8393            # Delete dataframe
 8394            del dataframe_hgvs
 8395            gc.collect()
 8396
 8397        # Remove added columns
 8398        for added_column in added_columns:
 8399            self.drop_column(column=added_column)
 8400
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Requires a FORMAT column and at least one sample column; otherwise the
        method is a no-op.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only meaningful for genotype data: needs FORMAT and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags (header descriptions)
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix for the temporary dataframe column
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE: the local variable name is referenced directly in the SQL
            # below (DuckDB resolves pandas dataframes by Python variable name)
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row from the sample columns
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the findbypipeline field in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append 'tag=value' to INFO, normalizing empty INFO ('' or '.')
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe (can be large)
            del dataframe_findbypipeline
            gc.collect()
 8506
 8507    def calculation_genotype_concordance(self) -> None:
 8508        """
 8509        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8510        multi-caller VCF files and updates the variant information in the database.
 8511        """
 8512
 8513        # if FORMAT and samples
 8514        if (
 8515            "FORMAT" in self.get_header_columns_as_list()
 8516            and self.get_header_sample_list()
 8517        ):
 8518
 8519            # genotypeconcordance annotation field
 8520            genotypeconcordance_tag = "genotypeconcordance"
 8521
 8522            # VCF infos tags
 8523            vcf_infos_tags = {
 8524                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8525            }
 8526
 8527            # Prefix
 8528            prefix = self.get_explode_infos_prefix()
 8529
 8530            # Field
 8531            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8532
 8533            # Variants table
 8534            table_variants = self.get_table_variants()
 8535
 8536            # Header
 8537            vcf_reader = self.get_header()
 8538
 8539            # Create variant id
 8540            variant_id_column = self.get_variant_id_column()
 8541            added_columns = [variant_id_column]
 8542
 8543            # variant_id, FORMAT and samples
 8544            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8545                self.get_header_sample_list()
 8546            )
 8547
 8548            # Create dataframe
 8549            dataframe_genotypeconcordance = self.get_query_to_df(
 8550                f""" SELECT {samples_fields} FROM {table_variants} """
 8551            )
 8552
 8553            # Create genotypeconcordance column
 8554            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8555                dataframe_genotypeconcordance.apply(
 8556                    lambda row: genotypeconcordance(
 8557                        row, samples=self.get_header_sample_list()
 8558                    ),
 8559                    axis=1,
 8560                )
 8561            )
 8562
 8563            # Add genotypeconcordance to header
 8564            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8565                genotypeconcordance_tag,
 8566                ".",
 8567                "String",
 8568                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8569                "howard calculation",
 8570                "0",
 8571                self.code_type_map.get("String"),
 8572            )
 8573
 8574            # Update
 8575            sql_update = f"""
 8576                UPDATE variants
 8577                SET "INFO" = 
 8578                    concat(
 8579                        CASE
 8580                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8581                            THEN ''
 8582                            ELSE concat("INFO", ';')
 8583                        END,
 8584                        CASE
 8585                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8586                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8587                            THEN concat(
 8588                                    '{genotypeconcordance_tag}=',
 8589                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8590                                )
 8591                            ELSE ''
 8592                        END
 8593                    )
 8594                FROM dataframe_genotypeconcordance
 8595                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8596            """
 8597            self.conn.execute(sql_update)
 8598
 8599            # Remove added columns
 8600            for added_column in added_columns:
 8601                self.drop_column(column=added_column)
 8602
 8603            # Delete dataframe
 8604            del dataframe_genotypeconcordance
 8605            gc.collect()
 8606
 8607    def calculation_barcode(self, tag: str = "barcode") -> None:
 8608        """
 8609        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8610        updates the INFO field in the file with the calculated barcode values.
 8611
 8612        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8613        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8614        the default tag name is set to "barcode", defaults to barcode
 8615        :type tag: str (optional)
 8616        """
 8617
 8618        # if FORMAT and samples
 8619        if (
 8620            "FORMAT" in self.get_header_columns_as_list()
 8621            and self.get_header_sample_list()
 8622        ):
 8623
 8624            # barcode annotation field
 8625            if not tag:
 8626                tag = "barcode"
 8627
 8628            # VCF infos tags
 8629            vcf_infos_tags = {
 8630                tag: "barcode calculation (VaRank)",
 8631            }
 8632
 8633            # Prefix
 8634            prefix = self.get_explode_infos_prefix()
 8635
 8636            # Field
 8637            barcode_infos = prefix + tag
 8638
 8639            # Variants table
 8640            table_variants = self.get_table_variants()
 8641
 8642            # Header
 8643            vcf_reader = self.get_header()
 8644
 8645            # Create variant id
 8646            variant_id_column = self.get_variant_id_column()
 8647            added_columns = [variant_id_column]
 8648
 8649            # variant_id, FORMAT and samples
 8650            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8651                self.get_header_sample_list()
 8652            )
 8653
 8654            # Create dataframe
 8655            dataframe_barcode = self.get_query_to_df(
 8656                f""" SELECT {samples_fields} FROM {table_variants} """
 8657            )
 8658
 8659            # Create barcode column
 8660            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8661                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8662            )
 8663
 8664            # Add barcode to header
 8665            vcf_reader.infos[tag] = vcf.parser._Info(
 8666                tag,
 8667                ".",
 8668                "String",
 8669                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8670                "howard calculation",
 8671                "0",
 8672                self.code_type_map.get("String"),
 8673            )
 8674
 8675            # Update
 8676            sql_update = f"""
 8677                UPDATE {table_variants}
 8678                SET "INFO" = 
 8679                    concat(
 8680                        CASE
 8681                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8682                            THEN ''
 8683                            ELSE concat("INFO", ';')
 8684                        END,
 8685                        CASE
 8686                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8687                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8688                            THEN concat(
 8689                                    '{tag}=',
 8690                                    dataframe_barcode."{barcode_infos}"
 8691                                )
 8692                            ELSE ''
 8693                        END
 8694                    )
 8695                FROM dataframe_barcode
 8696                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8697            """
 8698            self.conn.execute(sql_update)
 8699
 8700            # Remove added columns
 8701            for added_column in added_columns:
 8702                self.drop_column(column=added_column)
 8703
 8704            # Delete dataframe
 8705            del dataframe_barcode
 8706            gc.collect()
 8707
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
        and updates the INFO field in the file with the calculated barcode values.

        The family pedigree comes from param
        ``calculation.calculations.BARCODEFAMILY.family_pedigree`` and may be a
        JSON file path, a JSON string, a comma-separated sample list, or a dict;
        when absent, all samples are used. Two FORMAT fields are added per
        sample: the barcode value (``tag``) and the list of family samples
        (``tag`` + "S").

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # Only meaningful for genotype data: needs FORMAT and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags (header descriptions)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix for the temporary dataframe column
            prefix = self.get_explode_infos_prefix()

            # PED param: file path, JSON string, sample list string, or dict
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a JSON file
                # NOTE: `ped` is rebound from path -> file handle -> dict
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # mapping each sample to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict (member -> sample name)
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of family sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample, mapped to itself
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added to the table, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and family samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            # NOTE: the local variable name is referenced directly in the SQL
            # below (DuckDB resolves pandas dataframes by Python variable name)
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row from the family samples
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the barcode family FORMAT fields in the header:
            # the barcode value itself (tag) and the family samples list (tagS)
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET clause per sample column (plus FORMAT):
            # family samples get the barcode value and the family sample list,
            # FORMAT gets the new field names, other samples get '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes ('./.'), pad with one '.' per FORMAT
                # field (derived by blanking alphanumerics/spaces in FORMAT)
                # before appending the two new values
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe (can be large)
            del dataframe_barcode
            gc.collect()
 8897
 8898    def calculation_trio(self) -> None:
 8899        """
 8900        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 8901        information to the INFO field of each variant.
 8902        """
 8903
 8904        # if FORMAT and samples
 8905        if (
 8906            "FORMAT" in self.get_header_columns_as_list()
 8907            and self.get_header_sample_list()
 8908        ):
 8909
 8910            # trio annotation field
 8911            trio_tag = "trio"
 8912
 8913            # VCF infos tags
 8914            vcf_infos_tags = {
 8915                "trio": "trio calculation",
 8916            }
 8917
 8918            # Param
 8919            param = self.get_param()
 8920
 8921            # Prefix
 8922            prefix = self.get_explode_infos_prefix()
 8923
 8924            # Trio param
 8925            trio_ped = (
 8926                param.get("calculation", {})
 8927                .get("calculations", {})
 8928                .get("TRIO", {})
 8929                .get("trio_pedigree", None)
 8930            )
 8931
 8932            # Load trio
 8933            if trio_ped:
 8934
 8935                # Trio pedigree is a file
 8936                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 8937                    log.debug("TRIO pedigree is file")
 8938                    with open(full_path(trio_ped)) as trio_ped:
 8939                        trio_ped = json.load(trio_ped)
 8940
 8941                # Trio pedigree is a string
 8942                elif isinstance(trio_ped, str):
 8943                    log.debug("TRIO pedigree is str")
 8944                    try:
 8945                        trio_ped = json.loads(trio_ped)
 8946                        log.debug("TRIO pedigree is json str")
 8947                    except ValueError as e:
 8948                        trio_samples = trio_ped.split(",")
 8949                        if len(trio_samples) == 3:
 8950                            trio_ped = {
 8951                                "father": trio_samples[0],
 8952                                "mother": trio_samples[1],
 8953                                "child": trio_samples[2],
 8954                            }
 8955                            log.debug("TRIO pedigree is list str")
 8956                        else:
 8957                            msg_error = "TRIO pedigree not well formatted"
 8958                            log.error(msg_error)
 8959                            raise ValueError(msg_error)
 8960
 8961                # Trio pedigree is a dict
 8962                elif isinstance(trio_ped, dict):
 8963                    log.debug("TRIO pedigree is dict")
 8964
 8965                # Trio pedigree is not well formatted
 8966                else:
 8967                    msg_error = "TRIO pedigree not well formatted"
 8968                    log.error(msg_error)
 8969                    raise ValueError(msg_error)
 8970
 8971                # Construct trio list
 8972                trio_samples = [
 8973                    trio_ped.get("father", ""),
 8974                    trio_ped.get("mother", ""),
 8975                    trio_ped.get("child", ""),
 8976                ]
 8977
 8978            else:
 8979                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 8980                samples_list = self.get_header_sample_list()
 8981                if len(samples_list) >= 3:
 8982                    trio_samples = self.get_header_sample_list()[0:3]
 8983                    trio_ped = {
 8984                        "father": trio_samples[0],
 8985                        "mother": trio_samples[1],
 8986                        "child": trio_samples[2],
 8987                    }
 8988                else:
 8989                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 8990                    log.error(msg_error)
 8991                    raise ValueError(msg_error)
 8992
 8993            # Check trio pedigree
 8994            if not trio_ped or len(trio_ped) != 3:
 8995                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 8996                log.error(msg_error)
 8997                raise ValueError(msg_error)
 8998
 8999            # Log
 9000            log.info(
 9001                f"Calculation 'TRIO' - Samples: "
 9002                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9003            )
 9004
 9005            # Field
 9006            trio_infos = prefix + trio_tag
 9007
 9008            # Variants table
 9009            table_variants = self.get_table_variants()
 9010
 9011            # Header
 9012            vcf_reader = self.get_header()
 9013
 9014            # Create variant id
 9015            variant_id_column = self.get_variant_id_column()
 9016            added_columns = [variant_id_column]
 9017
 9018            # variant_id, FORMAT and samples
 9019            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9020                self.get_header_sample_list()
 9021            )
 9022
 9023            # Create dataframe
 9024            dataframe_trio = self.get_query_to_df(
 9025                f""" SELECT {samples_fields} FROM {table_variants} """
 9026            )
 9027
 9028            # Create trio column
 9029            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9030                lambda row: trio(row, samples=trio_samples), axis=1
 9031            )
 9032
 9033            # Add trio to header
 9034            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9035                trio_tag,
 9036                ".",
 9037                "String",
 9038                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9039                "howard calculation",
 9040                "0",
 9041                self.code_type_map.get("String"),
 9042            )
 9043
 9044            # Update
 9045            sql_update = f"""
 9046                UPDATE {table_variants}
 9047                SET "INFO" = 
 9048                    concat(
 9049                        CASE
 9050                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9051                            THEN ''
 9052                            ELSE concat("INFO", ';')
 9053                        END,
 9054                        CASE
 9055                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9056                             AND dataframe_trio."{trio_infos}" NOT NULL
 9057                            THEN concat(
 9058                                    '{trio_tag}=',
 9059                                    dataframe_trio."{trio_infos}"
 9060                                )
 9061                            ELSE ''
 9062                        END
 9063                    )
 9064                FROM dataframe_trio
 9065                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9066            """
 9067            self.conn.execute(sql_update)
 9068
 9069            # Remove added columns
 9070            for added_column in added_columns:
 9071                self.drop_column(column=added_column)
 9072
 9073            # Delete dataframe
 9074            del dataframe_trio
 9075            gc.collect()
 9076
 9077    def calculation_vaf_normalization(self) -> None:
 9078        """
 9079        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9080        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9081        :return: The function does not return anything.
 9082        """
 9083
 9084        # if FORMAT and samples
 9085        if (
 9086            "FORMAT" in self.get_header_columns_as_list()
 9087            and self.get_header_sample_list()
 9088        ):
 9089
 9090            # vaf_normalization annotation field
 9091            vaf_normalization_tag = "VAF"
 9092
 9093            # VCF infos tags
 9094            vcf_infos_tags = {
 9095                "VAF": "VAF Variant Frequency",
 9096            }
 9097
 9098            # Prefix
 9099            prefix = self.get_explode_infos_prefix()
 9100
 9101            # Variants table
 9102            table_variants = self.get_table_variants()
 9103
 9104            # Header
 9105            vcf_reader = self.get_header()
 9106
 9107            # Do not calculate if VAF already exists
 9108            if "VAF" in vcf_reader.formats:
 9109                log.debug("VAF already on genotypes")
 9110                return
 9111
 9112            # Create variant id
 9113            variant_id_column = self.get_variant_id_column()
 9114            added_columns = [variant_id_column]
 9115
 9116            # variant_id, FORMAT and samples
 9117            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9118                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9119            )
 9120
 9121            # Create dataframe
 9122            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9123            log.debug(f"query={query}")
 9124            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9125
 9126            vaf_normalization_set = []
 9127
 9128            # for each sample vaf_normalization
 9129            for sample in self.get_header_sample_list():
 9130                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9131                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9132                )
 9133                vaf_normalization_set.append(
 9134                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9135                )
 9136
 9137            # Add VAF to FORMAT
 9138            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9139                "FORMAT"
 9140            ].apply(lambda x: str(x) + ":VAF")
 9141            vaf_normalization_set.append(
 9142                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9143            )
 9144
 9145            # Add vaf_normalization to header
 9146            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9147                id=vaf_normalization_tag,
 9148                num="1",
 9149                type="Float",
 9150                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9151                type_code=self.code_type_map.get("Float"),
 9152            )
 9153
 9154            # Create fields to add in INFO
 9155            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9156
 9157            # Update
 9158            sql_update = f"""
 9159                UPDATE {table_variants}
 9160                SET {sql_vaf_normalization_set}
 9161                FROM dataframe_vaf_normalization
 9162                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9163
 9164            """
 9165            self.conn.execute(sql_update)
 9166
 9167            # Remove added columns
 9168            for added_column in added_columns:
 9169                self.drop_column(column=added_column)
 9170
 9171            # Delete dataframe
 9172            del dataframe_vaf_normalization
 9173            gc.collect()
 9174
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        For the given `info` field (e.g. "VAF"), seven INFO tags are generated:
        `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`, `<info>_stats_max`,
        `<info>_stats_mean`, `<info>_stats_mediane` and `<info>_stats_stdev`.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Genotype statistics only apply when genotypes exist (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one entry per generated statistic, used both as the
            # INFO tag name and to build its header description
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Field holding the raw stats dict in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to the variants table, removed at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: one dict of statistics per variant,
            # computed over all samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL fragments, one per generated INFO tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict into its own column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic INFO tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator between INFO entries: none before the first tag,
                # ';' before each subsequent one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO (skip NULL values entirely)
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics to the INFO column, joined on the variant id column
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
 9312
 9313    def calculation_transcripts_annotation(
 9314        self, info_json: str = None, info_format: str = None
 9315    ) -> None:
 9316        """
 9317        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9318        field to it if transcripts are available.
 9319
 9320        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9321        is a string parameter that represents the information field to be used in the transcripts JSON.
 9322        It is used to specify the JSON format for the transcripts information. If no value is provided
 9323        when calling the method, it defaults to "
 9324        :type info_json: str
 9325        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9326        method is a string parameter that specifies the format of the information field to be used in
 9327        the transcripts JSON. It is used to define the format of the information field
 9328        :type info_format: str
 9329        """
 9330
 9331        # Create transcripts table
 9332        transcripts_table = self.create_transcript_view()
 9333
 9334        # Add info field
 9335        if transcripts_table:
 9336            self.transcript_view_to_variants(
 9337                transcripts_table=transcripts_table,
 9338                transcripts_info_field_json=info_json,
 9339                transcripts_info_field_format=info_format,
 9340            )
 9341        else:
 9342            log.info("No Transcripts to process. Check param.json file configuration")
 9343
 9344    def calculation_transcripts_prioritization(self) -> None:
 9345        """
 9346        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9347        prioritizes transcripts based on certain criteria.
 9348        """
 9349
 9350        # Create transcripts table
 9351        transcripts_table = self.create_transcript_view()
 9352
 9353        # Add info field
 9354        if transcripts_table:
 9355            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9356        else:
 9357            log.info("No Transcripts to process. Check param.json file configuration")
 9358
 9359    ###############
 9360    # Transcripts #
 9361    ###############
 9362
 9363    def transcripts_prioritization(
 9364        self, transcripts_table: str = None, param: dict = {}
 9365    ) -> bool:
 9366        """
 9367        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9368        and updates the variants table with the prioritized information.
 9369
 9370        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9371        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9372        This parameter is used to identify the table where the transcripts data is stored for the
 9373        prioritization process
 9374        :type transcripts_table: str
 9375        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9376        that contains various configuration settings for the prioritization process of transcripts. It
 9377        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9378        the prefix for prioritization fields, default profiles, and other
 9379        :type param: dict
 9380        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9381        transcripts prioritization process is successfully completed, and `False` if there are any
 9382        issues or if no profile is defined for transcripts prioritization.
 9383        """
 9384
 9385        log.debug("Start transcripts prioritization...")
 9386
 9387        # Param
 9388        if not param:
 9389            param = self.get_param()
 9390
 9391        # Variants table
 9392        table_variants = self.get_table_variants()
 9393        log.debug(f"transcripts_table={transcripts_table}")
 9394        # Transcripts table
 9395        if transcripts_table is None:
 9396            log.debug(f"transcripts_table={transcripts_table}")
 9397            transcripts_table = self.create_transcript_view(
 9398                transcripts_table="transcripts", param=param
 9399            )
 9400            log.debug(f"transcripts_table={transcripts_table}")
 9401        if transcripts_table is None:
 9402            msg_err = "No Transcripts table availalble"
 9403            log.error(msg_err)
 9404            raise ValueError(msg_err)
 9405
 9406        # Get transcripts columns
 9407        columns_as_list_query = f"""
 9408            DESCRIBE {transcripts_table}
 9409        """
 9410        columns_as_list = list(
 9411            self.get_query_to_df(columns_as_list_query)["column_name"]
 9412        )
 9413
 9414        # Create INFO if not exists
 9415        if "INFO" not in columns_as_list:
 9416            query_add_info = f"""
 9417                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9418            """
 9419            self.execute_query(query_add_info)
 9420
 9421        # Prioritization param and Force only PZ Score and Flag
 9422        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9423        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9424        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9425        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9426        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9427        pz_profile_default = (
 9428            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9429        )
 9430
 9431        # Exit if no profile
 9432        if pz_profile_default is None:
 9433            log.warning("No profile defined for transcripts prioritization")
 9434            return False
 9435
 9436        # Prioritization
 9437        prioritization_result = self.prioritization(
 9438            table=transcripts_table,
 9439            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9440        )
 9441        if not prioritization_result:
 9442            log.warning("Transcripts prioritization not processed")
 9443            return False
 9444
 9445        # Explode PZ fields
 9446        self.explode_infos(
 9447            table=transcripts_table,
 9448            fields=param.get("transcripts", {})
 9449            .get("prioritization", {})
 9450            .get("pzfields", []),
 9451        )
 9452
 9453        # Export Transcripts prioritization infos to variants table
 9454        query_update = f"""
 9455            WITH RankedTranscripts AS (
 9456                SELECT
 9457                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9458                    ROW_NUMBER() OVER (
 9459                        PARTITION BY "#CHROM", POS, REF, ALT
 9460                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9461                    ) AS rn
 9462                FROM
 9463                    {transcripts_table}
 9464            )
 9465            UPDATE {table_variants}
 9466                SET
 9467                INFO = CONCAT(CASE
 9468                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9469                            THEN ''
 9470                            ELSE concat("INFO", ';')
 9471                        END,
 9472                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9473                        )
 9474            FROM
 9475                RankedTranscripts
 9476            WHERE
 9477                rn = 1
 9478                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9479                AND variants."POS" = RankedTranscripts."POS"
 9480                AND variants."REF" = RankedTranscripts."REF"
 9481                AND variants."ALT" = RankedTranscripts."ALT"
 9482                
 9483        """
 9484        self.execute_query(query=query_update)
 9485
 9486        # Add PZ Transcript in header
 9487        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9488            pz_fields_transcripts,
 9489            ".",
 9490            "String",
 9491            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9492            "unknown",
 9493            "unknown",
 9494            code_type_map["String"],
 9495        )
 9496
 9497        # Return
 9498        return True
 9499
 9500    def create_transcript_view_from_columns_map(
 9501        self,
 9502        transcripts_table: str = "transcripts",
 9503        columns_maps: dict = {},
 9504        added_columns: list = [],
 9505        temporary_tables: list = None,
 9506        annotation_fields: list = None,
 9507    ) -> tuple[list, list, list]:
 9508        """
 9509        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9510        specified columns mapping for transcripts data.
 9511
 9512        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9513        the table where the transcripts data is stored or will be stored in the database. This table
 9514        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9515        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9516        :type transcripts_table: str (optional)
 9517        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9518        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9519        represents a mapping configuration for a specific set of columns. It typically includes details such
 9520        as the main transcript column and additional information columns
 9521        :type columns_maps: dict
 9522        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9523        function is a list that stores the additional columns that will be added to the view being created
 9524        based on the columns map provided. These columns are generated by exploding the transcript
 9525        information columns along with the main transcript column
 9526        :type added_columns: list
 9527        :param temporary_tables: The `temporary_tables` parameter in the
 9528        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9529        tables created during the process of creating a transcript view from a columns map. These temporary
 9530        tables are used to store intermediate results or transformations before the final view is generated
 9531        :type temporary_tables: list
 9532        :param annotation_fields: The `annotation_fields` parameter in the
 9533        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9534        for annotation in the query view creation process. These fields are extracted from the
 9535        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9536        :type annotation_fields: list
 9537        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9538        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9539        """
 9540
 9541        log.debug("Start transcrpts view creation from columns map...")
 9542
 9543        # "from_columns_map": [
 9544        #     {
 9545        #         "transcripts_column": "Ensembl_transcriptid",
 9546        #         "transcripts_infos_columns": [
 9547        #             "genename",
 9548        #             "Ensembl_geneid",
 9549        #             "LIST_S2_score",
 9550        #             "LIST_S2_pred",
 9551        #         ],
 9552        #     },
 9553        #     {
 9554        #         "transcripts_column": "Ensembl_transcriptid",
 9555        #         "transcripts_infos_columns": [
 9556        #             "genename",
 9557        #             "VARITY_R_score",
 9558        #             "Aloft_pred",
 9559        #         ],
 9560        #     },
 9561        # ],
 9562
 9563        # Init
 9564        if temporary_tables is None:
 9565            temporary_tables = []
 9566        if annotation_fields is None:
 9567            annotation_fields = []
 9568
 9569        # Variants table
 9570        table_variants = self.get_table_variants()
 9571
 9572        for columns_map in columns_maps:
 9573
 9574            # Transcript column
 9575            transcripts_column = columns_map.get("transcripts_column", None)
 9576
 9577            # Transcripts infos columns
 9578            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9579
 9580            if transcripts_column is not None:
 9581
 9582                # Explode
 9583                added_columns += self.explode_infos(
 9584                    fields=[transcripts_column] + transcripts_infos_columns
 9585                )
 9586
 9587                # View clauses
 9588                clause_select = []
 9589                for field in [transcripts_column] + transcripts_infos_columns:
 9590                    clause_select.append(
 9591                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9592                    )
 9593                    if field not in [transcripts_column]:
 9594                        annotation_fields.append(field)
 9595
 9596                # Querey View
 9597                query = f""" 
 9598                    SELECT
 9599                        "#CHROM", POS, REF, ALT,
 9600                        "{transcripts_column}" AS 'transcript',
 9601                        {", ".join(clause_select)}
 9602                    FROM (
 9603                        SELECT 
 9604                            "#CHROM", POS, REF, ALT,
 9605                            {", ".join(clause_select)}
 9606                        FROM {table_variants}
 9607                        )
 9608                    WHERE "{transcripts_column}" IS NOT NULL
 9609                """
 9610
 9611                # Create temporary table
 9612                temporary_table = transcripts_table + "".join(
 9613                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9614                )
 9615
 9616                # Temporary_tables
 9617                temporary_tables.append(temporary_table)
 9618                query_view = f"""
 9619                    CREATE TEMPORARY TABLE {temporary_table}
 9620                    AS ({query})
 9621                """
 9622                self.execute_query(query=query_view)
 9623
 9624        return added_columns, temporary_tables, annotation_fields
 9625
 9626    def create_transcript_view_from_column_format(
 9627        self,
 9628        transcripts_table: str = "transcripts",
 9629        column_formats: dict = {},
 9630        temporary_tables: list = None,
 9631        annotation_fields: list = None,
 9632    ) -> tuple[list, list, list]:
 9633        """
 9634        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9635        specified column formats, adds additional columns and annotation fields, and returns the list of
 9636        temporary tables and annotation fields.
 9637
 9638        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9639        the table containing the transcripts data. This table will be used as the base table for creating
 9640        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9641        different table name if needed, defaults to transcripts
 9642        :type transcripts_table: str (optional)
 9643        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9644        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9645        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9646        the provided code snippet:
 9647        :type column_formats: dict
 9648        :param temporary_tables: The `temporary_tables` parameter in the
 9649        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9650        views created during the process of creating a transcript view from a column format. These temporary
 9651        views are used to manipulate and extract data before generating the final transcript view. It
 9652        :type temporary_tables: list
 9653        :param annotation_fields: The `annotation_fields` parameter in the
 9654        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9655        that are extracted from the temporary views created during the process. These annotation fields are
 9656        obtained by querying the temporary views and extracting the column names excluding specific columns
 9657        like `#CH
 9658        :type annotation_fields: list
 9659        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9660        `temporary_tables` and `annotation_fields`.
 9661        """
 9662
 9663        log.debug("Start transcrpts view creation from column format...")
 9664
 9665        #  "from_column_format": [
 9666        #     {
 9667        #         "transcripts_column": "ANN",
 9668        #         "transcripts_infos_column": "Feature_ID",
 9669        #     }
 9670        # ],
 9671
 9672        # Init
 9673        if temporary_tables is None:
 9674            temporary_tables = []
 9675        if annotation_fields is None:
 9676            annotation_fields = []
 9677
 9678        for column_format in column_formats:
 9679
 9680            # annotation field and transcript annotation field
 9681            annotation_field = column_format.get("transcripts_column", "ANN")
 9682            transcript_annotation = column_format.get(
 9683                "transcripts_infos_column", "Feature_ID"
 9684            )
 9685
 9686            # Temporary View name
 9687            temporary_view_name = transcripts_table + "".join(
 9688                random.choices(string.ascii_uppercase + string.digits, k=10)
 9689            )
 9690
 9691            # Create temporary view name
 9692            temporary_view_name = self.annotation_format_to_table(
 9693                uniquify=True,
 9694                annotation_field=annotation_field,
 9695                view_name=temporary_view_name,
 9696                annotation_id=transcript_annotation,
 9697            )
 9698
 9699            # Annotation fields
 9700            if temporary_view_name:
 9701                query_annotation_fields = f"""
 9702                    SELECT *
 9703                    FROM (
 9704                        DESCRIBE SELECT *
 9705                        FROM {temporary_view_name}
 9706                        )
 9707                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9708                """
 9709                df_annotation_fields = self.get_query_to_df(
 9710                    query=query_annotation_fields
 9711                )
 9712
 9713                # Add temporary view and annotation fields
 9714                temporary_tables.append(temporary_view_name)
 9715                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9716
 9717        return temporary_tables, annotation_fields
 9718
 9719    def create_transcript_view(
 9720        self,
 9721        transcripts_table: str = None,
 9722        transcripts_table_drop: bool = True,
 9723        param: dict = {},
 9724    ) -> str:
 9725        """
 9726        The `create_transcript_view` function generates a transcript view by processing data from a
 9727        specified table based on provided parameters and structural information.
 9728
 9729        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9730        is used to specify the name of the table that will store the final transcript view data. If a table
 9731        name is not provided, the function will create a new table to store the transcript view data, and by
 9732        default,, defaults to transcripts
 9733        :type transcripts_table: str (optional)
 9734        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9735        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9736        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9737        the function will drop the existing transcripts table if it exists, defaults to True
 9738        :type transcripts_table_drop: bool (optional)
 9739        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9740        contains information needed to create a transcript view. It includes details such as the structure
 9741        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9742        the view. This parameter allows for flexibility and customization
 9743        :type param: dict
 9744        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9745        created or modified during the execution of the function.
 9746        """
 9747
 9748        log.debug("Start transcripts view creation...")
 9749
 9750        # Default
 9751        transcripts_table_default = "transcripts"
 9752
 9753        # Param
 9754        if not param:
 9755            param = self.get_param()
 9756
 9757        # Struct
 9758        struct = param.get("transcripts", {}).get("struct", None)
 9759
 9760        if struct:
 9761
 9762            # Transcripts table
 9763            if transcripts_table is None:
 9764                transcripts_table = param.get("transcripts", {}).get(
 9765                    "table", transcripts_table_default
 9766                )
 9767
 9768            # added_columns
 9769            added_columns = []
 9770
 9771            # Temporary tables
 9772            temporary_tables = []
 9773
 9774            # Annotation fields
 9775            annotation_fields = []
 9776
 9777            # from columns map
 9778            columns_maps = struct.get("from_columns_map", [])
 9779            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
 9780                self.create_transcript_view_from_columns_map(
 9781                    transcripts_table=transcripts_table,
 9782                    columns_maps=columns_maps,
 9783                    added_columns=added_columns,
 9784                    temporary_tables=temporary_tables,
 9785                    annotation_fields=annotation_fields,
 9786                )
 9787            )
 9788            added_columns += added_columns_tmp
 9789            temporary_tables += temporary_tables_tmp
 9790            annotation_fields += annotation_fields_tmp
 9791
 9792            # from column format
 9793            column_formats = struct.get("from_column_format", [])
 9794            temporary_tables_tmp, annotation_fields_tmp = (
 9795                self.create_transcript_view_from_column_format(
 9796                    transcripts_table=transcripts_table,
 9797                    column_formats=column_formats,
 9798                    temporary_tables=temporary_tables,
 9799                    annotation_fields=annotation_fields,
 9800                )
 9801            )
 9802            temporary_tables += temporary_tables_tmp
 9803            annotation_fields += annotation_fields_tmp
 9804
 9805            # Merge temporary tables query
 9806            query_merge = ""
 9807            for temporary_table in temporary_tables:
 9808
 9809                # First temporary table
 9810                if not query_merge:
 9811                    query_merge = f"""
 9812                        SELECT * FROM {temporary_table}
 9813                    """
 9814                # other temporary table (using UNION)
 9815                else:
 9816                    query_merge += f"""
 9817                        UNION BY NAME SELECT * FROM {temporary_table}
 9818                    """
 9819
 9820            # Merge on transcript
 9821            query_merge_on_transcripts_annotation_fields = []
 9822            # Aggregate all annotations fields
 9823            for annotation_field in set(annotation_fields):
 9824                query_merge_on_transcripts_annotation_fields.append(
 9825                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
 9826                )
 9827            # Query for transcripts view
 9828            query_merge_on_transcripts = f"""
 9829                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
 9830                FROM ({query_merge})
 9831                GROUP BY "#CHROM", POS, REF, ALT, transcript
 9832            """
 9833
 9834            # Drop transcript view is necessary
 9835            if transcripts_table_drop:
 9836                query_drop = f"""
 9837                    DROP TABLE IF EXISTS {transcripts_table};
 9838                """
 9839                self.execute_query(query=query_drop)
 9840
 9841            # Merge and create transcript view
 9842            query_create_view = f"""
 9843                CREATE TABLE IF NOT EXISTS {transcripts_table}
 9844                AS {query_merge_on_transcripts}
 9845            """
 9846            self.execute_query(query=query_create_view)
 9847
 9848            # Remove added columns
 9849            for added_column in added_columns:
 9850                self.drop_column(column=added_column)
 9851
 9852        else:
 9853
 9854            transcripts_table = None
 9855
 9856        return transcripts_table
 9857
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
        table format.

        The annotation column (e.g. 'ANN') is exploded per transcript: the sub-field
        names are read from the "'...'" part of the INFO header description, each
        annotation is converted to JSON, the JSON keys are typed from their observed
        values, and a temporary table with one typed column per sub-field plus a
        'transcript' column is created.

        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
        values in the output or not. If set to `True`, the function will make sure that the output values
        are unique, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
        contains the annotation information for each variant. This field is used to extract the annotation
        details for further processing in the function, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: The `annotation_id` parameter is the sub-field used as the
        transcript identifier; non-alphanumeric characters are stripped before it is used
        as the 'transcript' column source, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: The `view_name` parameter is the name of the temporary table created
        to hold the exploded annotation data, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the created temporary table, or None when `annotation_field`
        is not present in the VCF header infos.
        :raises ValueError: when the INFO header description does not contain a
        quoted "field1 | field2 | ..." section.
        """

        # Name of the intermediate JSON column holding the exploded annotation
        annotation_format = "annotation_explode"

        # Transcript annotation: keep only alphanumeric characters (must match
        # the cleaned column names produced below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): assumes get_explode_infos_prefix() returns a string; a
        # None return would make the concatenations below fail — confirm.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names of the exploded annotation and of the JSON intermediate
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped again before returning)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract ANN header: the sub-field names are expected inside single
            # quotes in the INFO description, separated by " | "
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Cleaned name (alphanumeric only) mapped to the original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (column added to the variants table)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with coordinates, variant id and the annotation column
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Create annotation columns: explode each annotation value to JSON,
            # keyed by the header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find keys present in the generated JSON.
            # NOTE(review): the SQL references `dataframe_annotation_format` by its
            # local variable name — presumably resolved by DuckDB's replacement
            # scan over local pandas DataFrames; confirm the connection is DuckDB.
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # For each key: detect the column type from observed values and build
            # the typed SELECT clause
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # key_clean: alphanumeric-only version used as output column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract the values of this key for type detection
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append the typed extraction clause (empty strings become NULL)
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, adding the transcript id column from
            # the cleaned `annotation_id` sub-field
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing created
            view_name = None

        # Remove the columns temporarily added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
10010
10011    def transcript_view_to_variants(
10012        self,
10013        transcripts_table: str = None,
10014        transcripts_column_id: str = None,
10015        transcripts_info_json: str = None,
10016        transcripts_info_field_json: str = None,
10017        transcripts_info_format: str = None,
10018        transcripts_info_field_format: str = None,
10019        param: dict = {},
10020    ) -> bool:
10021        """
10022        The `transcript_view_to_variants` function updates a variants table with information from
10023        transcripts in JSON format.
10024
10025        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10026        table containing the transcripts data. If this parameter is not provided, the function will
10027        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10028        :type transcripts_table: str
10029        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10030        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10031        identifier is used to match transcripts with variants in the database
10032        :type transcripts_column_id: str
10033        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10034        of the column in the variants table where the transcripts information will be stored in JSON
10035        format. This parameter allows you to define the column in the variants table that will hold the
10036        JSON-formatted information about transcripts
10037        :type transcripts_info_json: str
10038        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10039        specify the field in the VCF header that will contain information about transcripts in JSON
10040        format. This field will be added to the VCF header as an INFO field with the specified name
10041        :type transcripts_info_field_json: str
10042        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10043        format of the information about transcripts that will be stored in the variants table. This
10044        format can be used to define how the transcript information will be structured or displayed
10045        within the variants table
10046        :type transcripts_info_format: str
10047        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10048        specify the field in the VCF header that will contain information about transcripts in a
10049        specific format. This field will be added to the VCF header as an INFO field with the specified
10050        name
10051        :type transcripts_info_field_format: str
10052        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10053        that contains various configuration settings related to transcripts. It is used to provide
10054        default values for certain parameters if they are not explicitly provided when calling the
10055        method. The `param` dictionary can be passed as an argument
10056        :type param: dict
10057        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10058        if the operation is successful and `False` if certain conditions are not met.
10059        """
10060
10061        msg_info_prefix = "Start transcripts view to variants annotations"
10062
10063        log.debug(f"{msg_info_prefix}...")
10064
10065        # Default
10066        transcripts_table_default = "transcripts"
10067        transcripts_column_id_default = "transcript"
10068        transcripts_info_json_default = None
10069        transcripts_info_format_default = None
10070        transcripts_info_field_json_default = None
10071        transcripts_info_field_format_default = None
10072
10073        # Param
10074        if not param:
10075            param = self.get_param()
10076
10077        # Transcripts table
10078        if transcripts_table is None:
10079            transcripts_table = param.get("transcripts", {}).get(
10080                "table", transcripts_table_default
10081            )
10082
10083        # Transcripts column ID
10084        if transcripts_column_id is None:
10085            transcripts_column_id = param.get("transcripts", {}).get(
10086                "column_id", transcripts_column_id_default
10087            )
10088
10089        # Transcripts info json
10090        if transcripts_info_json is None:
10091            transcripts_info_json = param.get("transcripts", {}).get(
10092                "transcripts_info_json", transcripts_info_json_default
10093            )
10094
10095        # Transcripts info field JSON
10096        if transcripts_info_field_json is None:
10097            transcripts_info_field_json = param.get("transcripts", {}).get(
10098                "transcripts_info_field_json", transcripts_info_field_json_default
10099            )
10100        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10101        #     transcripts_info_json = transcripts_info_field_json
10102
10103        # Transcripts info format
10104        if transcripts_info_format is None:
10105            transcripts_info_format = param.get("transcripts", {}).get(
10106                "transcripts_info_format", transcripts_info_format_default
10107            )
10108
10109        # Transcripts info field FORMAT
10110        if transcripts_info_field_format is None:
10111            transcripts_info_field_format = param.get("transcripts", {}).get(
10112                "transcripts_info_field_format", transcripts_info_field_format_default
10113            )
10114        # if (
10115        #     transcripts_info_field_format is not None
10116        #     and transcripts_info_format is None
10117        # ):
10118        #     transcripts_info_format = transcripts_info_field_format
10119
10120        # Variants table
10121        table_variants = self.get_table_variants()
10122
10123        # Check info columns param
10124        if (
10125            transcripts_info_json is None
10126            and transcripts_info_field_json is None
10127            and transcripts_info_format is None
10128            and transcripts_info_field_format is None
10129        ):
10130            return False
10131
10132        # Transcripts infos columns
10133        query_transcripts_infos_columns = f"""
10134            SELECT *
10135            FROM (
10136                DESCRIBE SELECT * FROM {transcripts_table}
10137                )
10138            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10139        """
10140        transcripts_infos_columns = list(
10141            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10142        )
10143
10144        # View results
10145        clause_select = []
10146        clause_to_json = []
10147        clause_to_format = []
10148        for field in transcripts_infos_columns:
10149            clause_select.append(
10150                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10151            )
10152            clause_to_json.append(f""" '{field}': "{field}" """)
10153            clause_to_format.append(f""" "{field}" """)
10154
10155        # Update
10156        update_set_json = []
10157        update_set_format = []
10158
10159        # VCF header
10160        vcf_reader = self.get_header()
10161
10162        # Transcripts to info column in JSON
10163        if transcripts_info_json is not None:
10164
10165            # Create column on variants table
10166            self.add_column(
10167                table_name=table_variants,
10168                column_name=transcripts_info_json,
10169                column_type="JSON",
10170                default_value=None,
10171                drop=False,
10172            )
10173
10174            # Add header
10175            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10176                transcripts_info_json,
10177                ".",
10178                "String",
10179                "Transcripts in JSON format",
10180                "unknwon",
10181                "unknwon",
10182                self.code_type_map["String"],
10183            )
10184
10185            # Add to update
10186            update_set_json.append(
10187                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10188            )
10189
10190        # Transcripts to info field in JSON
10191        if transcripts_info_field_json is not None:
10192
10193            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10194
10195            # Add to update
10196            update_set_json.append(
10197                f""" 
10198                    INFO = concat(
10199                            CASE
10200                                WHEN INFO NOT IN ('', '.')
10201                                THEN INFO
10202                                ELSE ''
10203                            END,
10204                            CASE
10205                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10206                                THEN concat(
10207                                    ';{transcripts_info_field_json}=',
10208                                    t.{transcripts_info_json}
10209                                )
10210                                ELSE ''
10211                            END
10212                            )
10213                """
10214            )
10215
10216            # Add header
10217            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10218                transcripts_info_field_json,
10219                ".",
10220                "String",
10221                "Transcripts in JSON format",
10222                "unknwon",
10223                "unknwon",
10224                self.code_type_map["String"],
10225            )
10226
10227        if update_set_json:
10228
10229            # Update query
10230            query_update = f"""
10231                UPDATE {table_variants}
10232                    SET {", ".join(update_set_json)}
10233                FROM
10234                (
10235                    SELECT
10236                        "#CHROM", POS, REF, ALT,
10237                            concat(
10238                            '{{',
10239                            string_agg(
10240                                '"' || "{transcripts_column_id}" || '":' ||
10241                                to_json(json_output)
10242                            ),
10243                            '}}'
10244                            )::JSON AS {transcripts_info_json}
10245                    FROM
10246                        (
10247                        SELECT
10248                            "#CHROM", POS, REF, ALT,
10249                            "{transcripts_column_id}",
10250                            to_json(
10251                                {{{",".join(clause_to_json)}}}
10252                            )::JSON AS json_output
10253                        FROM
10254                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10255                        WHERE "{transcripts_column_id}" IS NOT NULL
10256                        )
10257                    GROUP BY "#CHROM", POS, REF, ALT
10258                ) AS t
10259                WHERE {table_variants}."#CHROM" = t."#CHROM"
10260                    AND {table_variants}."POS" = t."POS"
10261                    AND {table_variants}."REF" = t."REF"
10262                    AND {table_variants}."ALT" = t."ALT"
10263            """
10264
10265            self.execute_query(query=query_update)
10266
10267        # Transcripts to info column in FORMAT
10268        if transcripts_info_format is not None:
10269
10270            # Create column on variants table
10271            self.add_column(
10272                table_name=table_variants,
10273                column_name=transcripts_info_format,
10274                column_type="VARCHAR",
10275                default_value=None,
10276                drop=False,
10277            )
10278
10279            # Add header
10280            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10281                transcripts_info_format,
10282                ".",
10283                "String",
10284                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10285                "unknwon",
10286                "unknwon",
10287                self.code_type_map["String"],
10288            )
10289
10290            # Add to update
10291            update_set_format.append(
10292                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10293            )
10294
10295        # Transcripts to info field in JSON
10296        if transcripts_info_field_format is not None:
10297
10298            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10299
10300            # Add to update
10301            update_set_format.append(
10302                f""" 
10303                    INFO = concat(
10304                            CASE
10305                                WHEN INFO NOT IN ('', '.')
10306                                THEN INFO
10307                                ELSE ''
10308                            END,
10309                            CASE
10310                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10311                                THEN concat(
10312                                    ';{transcripts_info_field_format}=',
10313                                    t.{transcripts_info_format}
10314                                )
10315                                ELSE ''
10316                            END
10317                            )
10318                """
10319            )
10320
10321            # Add header
10322            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10323                transcripts_info_field_format,
10324                ".",
10325                "String",
10326                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10327                "unknwon",
10328                "unknwon",
10329                self.code_type_map["String"],
10330            )
10331
10332        if update_set_format:
10333
10334            # Update query
10335            query_update = f"""
10336                UPDATE {table_variants}
10337                    SET {", ".join(update_set_format)}
10338                FROM
10339                (
10340                    SELECT
10341                        "#CHROM", POS, REF, ALT,
10342                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10343                    FROM 
10344                        (
10345                        SELECT
10346                            "#CHROM", POS, REF, ALT,
10347                            "{transcripts_column_id}",
10348                            concat(
10349                                "{transcripts_column_id}",
10350                                '|',
10351                                {", '|', ".join(clause_to_format)}
10352                            ) AS {transcripts_info_format}
10353                        FROM
10354                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10355                        )
10356                    GROUP BY "#CHROM", POS, REF, ALT
10357                ) AS t
10358                WHERE {table_variants}."#CHROM" = t."#CHROM"
10359                    AND {table_variants}."POS" = t."POS"
10360                    AND {table_variants}."REF" = t."REF"
10361                    AND {table_variants}."ALT" = t."ALT"
10362            """
10363
10364            self.execute_query(query=query_update)
10365
10366        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        The function `__init__` initializes the variables, sets the input, output, config, param, connection and
        header, and optionally loads the data.

        :param conn: the connection to the database
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: when True, load the data from the input after initialization
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # Connection to the database
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Load data
        if load:
            self.load_data()

The function __init__ initializes the instance variables and sets the input, output, config, param, database connection and header; when load is True, it also loads the data.

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_input(self, input: str = None) -> None:
 81    def set_input(self, input: str = None) -> None:
 82        """
 83        The function `set_input` takes a file name as input, extracts the name and extension, and sets
 84        attributes in the class accordingly.
 85
 86        :param input: The `set_input` method in the provided code snippet is used to set attributes
 87        related to the input file. Here's a breakdown of the parameters and their usage in the method:
 88        :type input: str
 89        """
 90
 91        if input and not isinstance(input, str):
 92            try:
 93                self.input = input.name
 94            except:
 95                log.error(f"Input file '{input} in bad format")
 96                raise ValueError(f"Input file '{input} in bad format")
 97        else:
 98            self.input = input
 99
100        # Input format
101        if input:
102            input_name, input_extension = os.path.splitext(self.input)
103            self.input_name = input_name
104            self.input_extension = input_extension
105            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file path, or a file-like object exposing a `name` attribute; used to derive the input name, extension and format
def set_config(self, config: dict) -> None:
107    def set_config(self, config: dict) -> None:
108        """
109        The set_config function takes a config object and assigns it as the configuration object for the
110        class.
111
112        :param config: The `config` parameter in the `set_config` function is a dictionary object that
113        contains configuration settings for the class. When you call the `set_config` function with a
114        dictionary object as the argument, it will set that dictionary as the configuration object for
115        the class
116        :type config: dict
117        """
118
119        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
121    def set_param(self, param: dict) -> None:
122        """
123        This function sets a parameter object for the class based on the input dictionary.
124
125        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
126        as the `param` attribute of the class instance
127        :type param: dict
128        """
129
130        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
132    def init_variables(self) -> None:
133        """
134        This function initializes the variables that will be used in the rest of the class
135        """
136
137        self.prefix = "howard"
138        self.table_variants = "variants"
139        self.dataframe = None
140
141        self.comparison_map = {
142            "gt": ">",
143            "gte": ">=",
144            "lt": "<",
145            "lte": "<=",
146            "equals": "=",
147            "contains": "SIMILAR TO",
148        }
149
150        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
151
152        self.code_type_map_to_sql = {
153            "Integer": "INTEGER",
154            "String": "VARCHAR",
155            "Float": "FLOAT",
156            "Flag": "VARCHAR",
157        }
158
159        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
161    def get_indexing(self) -> bool:
162        """
163        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
164        returns False.
165        :return: The value of the indexing parameter.
166        """
167
168        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
170    def get_connexion_config(self) -> dict:
171        """
172        The function `get_connexion_config` returns a dictionary containing the configuration for a
173        connection, including the number of threads and memory limit.
174        :return: a dictionary containing the configuration for the Connexion library.
175        """
176
177        # config
178        config = self.get_config()
179
180        # Connexion config
181        connexion_config = {}
182        threads = self.get_threads()
183
184        # Threads
185        if threads:
186            connexion_config["threads"] = threads
187
188        # Memory
189        # if config.get("memory", None):
190        #     connexion_config["memory_limit"] = config.get("memory")
191        if self.get_memory():
192            connexion_config["memory_limit"] = self.get_memory()
193
194        # Temporary directory
195        if config.get("tmp", None):
196            connexion_config["temp_directory"] = config.get("tmp")
197
198        # Access
199        if config.get("access", None):
200            access = config.get("access")
201            if access in ["RO"]:
202                access = "READ_ONLY"
203            elif access in ["RW"]:
204                access = "READ_WRITE"
205            connexion_db = self.get_connexion_db()
206            if connexion_db in ":memory:":
207                access = "READ_WRITE"
208            connexion_config["access_mode"] = access
209
210        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
212    def get_duckdb_settings(self) -> dict:
213        """
214        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
215        string.
216        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
217        """
218
219        # config
220        config = self.get_config()
221
222        # duckdb settings
223        duckdb_settings_dict = {}
224        if config.get("duckdb_settings", None):
225            duckdb_settings = config.get("duckdb_settings")
226            duckdb_settings = full_path(duckdb_settings)
227            # duckdb setting is a file
228            if os.path.exists(duckdb_settings):
229                with open(duckdb_settings) as json_file:
230                    duckdb_settings_dict = yaml.safe_load(json_file)
231            # duckdb settings is a string
232            else:
233                duckdb_settings_dict = json.loads(duckdb_settings)
234
235        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
237    def set_connexion_db(self) -> str:
238        """
239        The function `set_connexion_db` returns the appropriate database connection string based on the
240        input format and connection type.
241        :return: the value of the variable `connexion_db`.
242        """
243
244        # Default connexion db
245        default_connexion_db = ":memory:"
246
247        # Find connexion db
248        if self.get_input_format() in ["db", "duckdb"]:
249            connexion_db = self.get_input()
250        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
251            connexion_db = default_connexion_db
252        elif self.get_connexion_type() in ["tmpfile"]:
253            tmp_name = tempfile.mkdtemp(
254                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
255            )
256            connexion_db = f"{tmp_name}/tmp.db"
257        elif self.get_connexion_type() != "":
258            connexion_db = self.get_connexion_type()
259        else:
260            connexion_db = default_connexion_db
261
262        # Set connexion db
263        self.connexion_db = connexion_db
264
265        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
267    def set_connexion(self, conn) -> None:
268        """
269        The function `set_connexion` creates a connection to a database, with options for different
270        database formats and settings.
271
272        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
273        database. If a connection is not provided, a new connection to an in-memory database is created.
274        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
275        sqlite
276        """
277
278        # Connexion db
279        connexion_db = self.set_connexion_db()
280
281        # Connexion config
282        connexion_config = self.get_connexion_config()
283
284        # Connexion format
285        connexion_format = self.get_config().get("connexion_format", "duckdb")
286        # Set connexion format
287        self.connexion_format = connexion_format
288
289        # Connexion
290        if not conn:
291            if connexion_format in ["duckdb"]:
292                conn = duckdb.connect(connexion_db, config=connexion_config)
293                # duckDB settings
294                duckdb_settings = self.get_duckdb_settings()
295                if duckdb_settings:
296                    for setting in duckdb_settings:
297                        setting_value = duckdb_settings.get(setting)
298                        if isinstance(setting_value, str):
299                            setting_value = f"'{setting_value}'"
300                        conn.execute(f"PRAGMA {setting}={setting_value};")
301            elif connexion_format in ["sqlite"]:
302                conn = sqlite3.connect(connexion_db)
303
304        # Set connexion
305        self.conn = conn
306
307        # Log
308        log.debug(f"connexion_format: {connexion_format}")
309        log.debug(f"connexion_db: {connexion_db}")
310        log.debug(f"connexion config: {connexion_config}")
311        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: the connection to the database. If no connection is provided, a new connection is created on an in-memory database; the method then sets up the connection according to the configured format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
313    def set_output(self, output: str = None) -> None:
314        """
315        The `set_output` function in Python sets the output file based on the input or a specified key
316        in the config file, extracting the output name, extension, and format.
317
318        :param output: The `output` parameter in the `set_output` method is used to specify the name of
319        the output file. If the config file has an 'output' key, the method sets the output to the value
320        of that key. If no output is provided, it sets the output to `None`
321        :type output: str
322        """
323
324        if output and not isinstance(output, str):
325            self.output = output.name
326        else:
327            self.output = output
328
329        # Output format
330        if self.output:
331            output_name, output_extension = os.path.splitext(self.output)
332            self.output_name = output_name
333            self.output_extension = output_extension
334            self.output_format = self.output_extension.replace(".", "")
335        else:
336            self.output_name = None
337            self.output_extension = None
338            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
340    def set_header(self) -> None:
341        """
342        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
343        """
344
345        input_file = self.get_input()
346        default_header_list = [
347            "##fileformat=VCFv4.2",
348            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
349        ]
350
351        # Full path
352        input_file = full_path(input_file)
353
354        if input_file:
355
356            input_format = self.get_input_format()
357            input_compressed = self.get_input_compressed()
358            config = self.get_config()
359            header_list = default_header_list
360            if input_format in [
361                "vcf",
362                "hdr",
363                "tsv",
364                "csv",
365                "psv",
366                "parquet",
367                "db",
368                "duckdb",
369            ]:
370                # header provided in param
371                if config.get("header_file", None):
372                    with open(config.get("header_file"), "rt") as f:
373                        header_list = self.read_vcf_header(f)
374                # within a vcf file format (header within input file itsself)
375                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
376                    # within a compressed vcf file format (.vcf.gz)
377                    if input_compressed:
378                        with bgzf.open(input_file, "rt") as f:
379                            header_list = self.read_vcf_header(f)
380                    # within an uncompressed vcf file format (.vcf)
381                    else:
382                        with open(input_file, "rt") as f:
383                            header_list = self.read_vcf_header(f)
384                # header provided in default external file .hdr
385                elif os.path.exists((input_file + ".hdr")):
386                    with open(input_file + ".hdr", "rt") as f:
387                        header_list = self.read_vcf_header(f)
388                else:
389                    try:  # Try to get header info fields and file columns
390
391                        with tempfile.TemporaryDirectory() as tmpdir:
392
393                            # Create database
394                            db_for_header = Database(database=input_file)
395
396                            # Get header columns for infos fields
397                            db_header_from_columns = (
398                                db_for_header.get_header_from_columns()
399                            )
400
401                            # Get real columns in the file
402                            db_header_columns = db_for_header.get_columns()
403
404                            # Write header file
405                            header_file_tmp = os.path.join(tmpdir, "header")
406                            f = open(header_file_tmp, "w")
407                            vcf.Writer(f, db_header_from_columns)
408                            f.close()
409
410                            # Replace #CHROM line with rel columns
411                            header_list = db_for_header.read_header_file(
412                                header_file=header_file_tmp
413                            )
414                            header_list[-1] = "\t".join(db_header_columns)
415
416                    except:
417
418                        log.warning(
419                            f"No header for file {input_file}. Set as default VCF header"
420                        )
421                        header_list = default_header_list
422
423            else:  # try for unknown format ?
424
425                log.error(f"Input file format '{input_format}' not available")
426                raise ValueError(f"Input file format '{input_format}' not available")
427
428            if not header_list:
429                header_list = default_header_list
430
431            # header as list
432            self.header_list = header_list
433
434            # header as VCF object
435            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
436
437        else:
438
439            self.header_list = None
440            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
442    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
443        """
444        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
445        DataFrame based on the connection format.
446
447        :param query: The `query` parameter in the `get_query_to_df` function is a string that
448        represents the SQL query you want to execute. This query will be used to fetch data from a
449        database and convert it into a pandas DataFrame
450        :type query: str
451        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
452        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
453        function will only fetch up to that number of rows from the database query result. If no limit
454        is specified,
455        :type limit: int
456        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
457        """
458
459        # Connexion format
460        connexion_format = self.get_connexion_format()
461
462        # Limit in query
463        if limit:
464            pd.set_option("display.max_rows", limit)
465            if connexion_format in ["duckdb"]:
466                df = (
467                    self.conn.execute(query)
468                    .fetch_record_batch(limit)
469                    .read_next_batch()
470                    .to_pandas()
471                )
472            elif connexion_format in ["sqlite"]:
473                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
474
475        # Full query
476        else:
477            if connexion_format in ["duckdb"]:
478                df = self.conn.execute(query).df()
479            elif connexion_format in ["sqlite"]:
480                df = pd.read_sql_query(query, self.conn)
481
482        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: the maximum number of rows to return in the resulting dataframe. If a limit is provided, the function fetches at most that many rows from the query result; if no limit is specified, all rows are returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
484    def get_overview(self) -> None:
485        """
486        The function prints the input, output, config, and dataframe of the current object
487        """
488        table_variants_from = self.get_table_variants(clause="from")
489        sql_columns = self.get_header_columns_as_sql()
490        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
491        df = self.get_query_to_df(sql_query_export)
492        log.info(
493            "Input:  "
494            + str(self.get_input())
495            + " ["
496            + str(str(self.get_input_format()))
497            + "]"
498        )
499        log.info(
500            "Output: "
501            + str(self.get_output())
502            + " ["
503            + str(str(self.get_output_format()))
504            + "]"
505        )
506        log.info("Config: ")
507        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
508            "\n"
509        ):
510            log.info("\t" + str(d))
511        log.info("Param: ")
512        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
513            "\n"
514        ):
515            log.info("\t" + str(d))
516        log.info("Sample list: " + str(self.get_header_sample_list()))
517        log.info("Dataframe: ")
518        for d in str(df).split("\n"):
519            log.info("\t" + str(d))
520
521        # garbage collector
522        del df
523        gc.collect()
524
525        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
    def get_stats(self) -> dict:
        """
        Calculate and return statistics of the current object: input file,
        variant counts (by chromosome, SNV/InDel/MNV types, substitutions),
        sample genotypes, INFO/FORMAT header fields and quality stats.

        :return: a dictionary with keys "Infos", "Variants", "Header" and,
            when applicable, "Samples" and "Quality"
        """

        # Log
        log.info(f"Stats Calculation...")

        # Table variants
        table_variants_from = self.get_table_variants()

        # Stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage (fraction of total, per chromosome)
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: only meaningful when GT FORMAT field and a FORMAT
        # column are both present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                # Count genotypes per sample, keeping only rows whose sample
                # value looks like a genotype and matches the FORMAT arity
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                # A sample counts only if it has at least one genotyped row
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        # `i` is a running row index shared across both field tables
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: map special VCF Number codes to their letters
                # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL: summary statistics, skipping missing ('.') values
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel branch, SQL AND binds tighter than OR,
        # so the filter is len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))
        # — confirm this is the intended classification

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        # Single-nucleotide substitution spectrum (e.g. "A>G")
        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object, organized under keys such as "Infos", "Variants", "Samples", "Header" and "Quality".

def stats_to_file(self, file: str = None) -> str:
749    def stats_to_file(self, file: str = None) -> str:
750        """
751        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
752        into a JSON object, and writes the JSON object to the specified file.
753
754        :param file: The `file` parameter is a string that represents the file path where the JSON data
755        will be written
756        :type file: str
757        :return: the name of the file that was written to.
758        """
759
760        # Get stats
761        stats = self.get_stats()
762
763        # Serializing json
764        json_object = json.dumps(stats, indent=4)
765
766        # Writing to sample.json
767        with open(file, "w") as outfile:
768            outfile.write(json_object)
769
770        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
772    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
773        """
774        The `print_stats` function generates a markdown file and prints the statistics contained in a
775        JSON file in a formatted manner.
776
777        :param output_file: The `output_file` parameter is a string that specifies the path and filename
778        of the output file where the stats will be printed in Markdown format. If no `output_file` is
779        provided, a temporary directory will be created and the stats will be saved in a file named
780        "stats.md" within that
781        :type output_file: str
782        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
783        file where the statistics will be saved. If no value is provided, a temporary directory will be
784        created and a default file name "stats.json" will be used
785        :type json_file: str
786        :return: The function `print_stats` does not return any value. It has a return type annotation
787        of `None`.
788        """
789
790        # Full path
791        output_file = full_path(output_file)
792        json_file = full_path(json_file)
793
794        with tempfile.TemporaryDirectory() as tmpdir:
795
796            # Files
797            if not output_file:
798                output_file = os.path.join(tmpdir, "stats.md")
799            if not json_file:
800                json_file = os.path.join(tmpdir, "stats.json")
801
802            # Create folders
803            if not os.path.exists(os.path.dirname(output_file)):
804                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
805            if not os.path.exists(os.path.dirname(json_file)):
806                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
807
808            # Create stats JSON file
809            stats_file = self.stats_to_file(file=json_file)
810
811            # Print stats file
812            with open(stats_file) as f:
813                stats = yaml.safe_load(f)
814
815            # Output
816            output_title = []
817            output_index = []
818            output = []
819
820            # Title
821            output_title.append("# HOWARD Stats")
822
823            # Index
824            output_index.append("## Index")
825
826            # Process sections
827            for section in stats:
828                infos = stats.get(section)
829                section_link = "#" + section.lower().replace(" ", "-")
830                output.append(f"## {section}")
831                output_index.append(f"- [{section}]({section_link})")
832
833                if len(infos):
834                    for info in infos:
835                        try:
836                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
837                            is_df = True
838                        except:
839                            try:
840                                df = pd.DataFrame.from_dict(
841                                    json.loads((infos.get(info))), orient="index"
842                                )
843                                is_df = True
844                            except:
845                                is_df = False
846                        if is_df:
847                            output.append(f"### {info}")
848                            info_link = "#" + info.lower().replace(" ", "-")
849                            output_index.append(f"   - [{info}]({info_link})")
850                            output.append(f"{df.to_markdown(index=False)}")
851                        else:
852                            output.append(f"- {info}: {infos.get(info)}")
853                else:
854                    output.append(f"NA")
855
856            # Write stats in markdown file
857            with open(output_file, "w") as fp:
858                for item in output_title:
859                    fp.write("%s\n" % item)
860                for item in output_index:
861                    fp.write("%s\n" % item)
862                for item in output:
863                    fp.write("%s\n" % item)
864
865            # Output stats in markdown
866            print("")
867            print("\n\n".join(output_title))
868            print("")
869            print("\n\n".join(output))
870            print("")
871
872        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
874    def get_input(self) -> str:
875        """
876        It returns the value of the input variable.
877        :return: The input is being returned.
878        """
879        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
881    def get_input_format(self, input_file: str = None) -> str:
882        """
883        This function returns the format of the input variable, either from the provided input file or
884        by prompting for input.
885
886        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
887        represents the file path of the input file. If no `input_file` is provided when calling the
888        method, it will default to `None`
889        :type input_file: str
890        :return: The format of the input variable is being returned.
891        """
892
893        if not input_file:
894            input_file = self.get_input()
895        input_format = get_file_format(input_file)
896        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
898    def get_input_compressed(self, input_file: str = None) -> str:
899        """
900        The function `get_input_compressed` returns the format of the input variable after compressing
901        it.
902
903        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
904        that represents the file path of the input file. If no `input_file` is provided when calling the
905        method, it will default to `None` and the method will then call `self.get_input()` to
906        :type input_file: str
907        :return: The function `get_input_compressed` returns the compressed format of the input
908        variable.
909        """
910
911        if not input_file:
912            input_file = self.get_input()
913        input_compressed = get_file_compressed(input_file)
914        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
916    def get_output(self) -> str:
917        """
918        It returns the output of the neuron.
919        :return: The output of the neural network.
920        """
921
922        return self.output

It returns the output file path.

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
924    def get_output_format(self, output_file: str = None) -> str:
925        """
926        The function `get_output_format` returns the format of the input variable or the output file if
927        provided.
928
929        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
930        that represents the file path of the output file. If no `output_file` is provided when calling
931        the method, it will default to the output obtained from the `get_output` method of the class
932        instance. The
933        :type output_file: str
934        :return: The format of the input variable is being returned.
935        """
936
937        if not output_file:
938            output_file = self.get_output()
939        output_format = get_file_format(output_file)
940
941        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
943    def get_config(self) -> dict:
944        """
945        It returns the config
946        :return: The config variable is being returned.
947        """
948        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
950    def get_param(self) -> dict:
951        """
952        It returns the param
953        :return: The param variable is being returned.
954        """
955        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
957    def get_connexion_db(self) -> str:
958        """
959        It returns the connexion_db attribute of the object
960        :return: The connexion_db is being returned.
961        """
962        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
964    def get_prefix(self) -> str:
965        """
966        It returns the prefix of the object.
967        :return: The prefix is being returned.
968        """
969        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
971    def get_table_variants(self, clause: str = "select") -> str:
972        """
973        This function returns the table_variants attribute of the object
974
975        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
976        defaults to select (optional)
977        :return: The table_variants attribute of the object.
978        """
979
980        # Access
981        access = self.get_config().get("access", None)
982
983        # Clauses "select", "where", "update"
984        if clause in ["select", "where", "update"]:
985            table_variants = self.table_variants
986        # Clause "from"
987        elif clause in ["from"]:
988            # For Read Only
989            if self.get_input_format() in ["parquet"] and access in ["RO"]:
990                input_file = self.get_input()
991                table_variants = f"'{input_file}' as variants"
992            # For Read Write
993            else:
994                table_variants = f"{self.table_variants} as variants"
995        else:
996            table_variants = self.table_variants
997        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
 999    def get_tmp_dir(self) -> str:
1000        """
1001        The function `get_tmp_dir` returns the temporary directory path based on configuration
1002        parameters or a default path.
1003        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1004        configuration, parameters, and a default value of "/tmp".
1005        """
1006
1007        return get_tmp(
1008            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1009        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1011    def get_connexion_type(self) -> str:
1012        """
1013        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1014
1015        :return: The connexion type is being returned.
1016        """
1017        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory".

Returns

The connexion type is being returned.

def get_connexion(self):
1019    def get_connexion(self):
1020        """
1021        It returns the connection object
1022
1023        :return: The connection object.
1024        """
1025        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1027    def close_connexion(self) -> None:
1028        """
1029        This function closes the connection to the database.
1030        :return: The connection is being closed.
1031        """
1032        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1034    def get_header(self, type: str = "vcf"):
1035        """
1036        This function returns the header of the VCF file as a list of strings
1037
1038        :param type: the type of header you want to get, defaults to vcf (optional)
1039        :return: The header of the vcf file.
1040        """
1041
1042        if self.header_vcf:
1043            if type == "vcf":
1044                return self.header_vcf
1045            elif type == "list":
1046                return self.header_list
1047        else:
1048            if type == "vcf":
1049                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1050                return header
1051            elif type == "list":
1052                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
1054    def get_header_length(self, file: str = None) -> int:
1055        """
1056        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1057        line.
1058
1059        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1060        header file. If this argument is provided, the function will read the header from the specified
1061        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1062        :type file: str
1063        :return: the length of the header list, excluding the #CHROM line.
1064        """
1065
1066        if file:
1067            return len(self.read_vcf_header_file(file=file)) - 1
1068        elif self.get_header(type="list"):
1069            return len(self.get_header(type="list")) - 1
1070        else:
1071            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1073    def get_header_columns(self) -> str:
1074        """
1075        This function returns the header list of a VCF
1076
1077        :return: The length of the header list.
1078        """
1079        if self.get_header():
1080            return self.get_header(type="list")[-1]
1081        else:
1082            return ""

This function returns the #CHROM columns line of the VCF header

Returns

The #CHROM columns line of the header, or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
1084    def get_header_columns_as_list(self) -> list:
1085        """
1086        This function returns the header list of a VCF
1087
1088        :return: The length of the header list.
1089        """
1090        if self.get_header():
1091            return self.get_header_columns().strip().split("\t")
1092        else:
1093            return []

This function returns the #CHROM columns line of the VCF header as a list

Returns

The column names of the #CHROM header line, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
1095    def get_header_columns_as_sql(self) -> str:
1096        """
1097        This function retruns header length (without #CHROM line)
1098
1099        :return: The length of the header list.
1100        """
1101        sql_column_list = []
1102        for col in self.get_header_columns_as_list():
1103            sql_column_list.append(f'"{col}"')
1104        return ",".join(sql_column_list)

This function returns the #CHROM header columns as a comma-separated list of quoted SQL identifiers

Returns

The quoted column names joined with commas.

def get_header_sample_list(self) -> list:
1106    def get_header_sample_list(self) -> list:
1107        """
1108        This function retruns header length (without #CHROM line)
1109
1110        :return: The length of the header list.
1111        """
1112        return self.header_vcf.samples

This function returns the sample names declared in the VCF header

Returns

The list of sample names.

def get_verbose(self) -> bool:
1114    def get_verbose(self) -> bool:
1115        """
1116        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1117        exist
1118
1119        :return: The value of the key "verbose" in the config dictionary.
1120        """
1121        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1123    def get_connexion_format(self) -> str:
1124        """
1125        It returns the connexion format of the object.
1126        :return: The connexion_format is being returned.
1127        """
1128        connexion_format = self.connexion_format
1129        if connexion_format not in ["duckdb", "sqlite"]:
1130            log.error(f"Unknown connexion format {connexion_format}")
1131            raise ValueError(f"Unknown connexion format {connexion_format}")
1132        else:
1133            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        "variants" table of the current connexion.

        :param file: Path or file-like object of the delimited file to load
        :param columns: Comma-separated list of (quoted) column names the data
        is inserted into
        :type columns: str
        :param header_len: Number of lines to skip at the beginning of the
        file (e.g. the VCF header lines), defaults to 0
        :type header_len: int (optional)
        :param sep: Column separator used in the file, defaults to a tab
        :type sep: str (optional)
        :param chunksize: Number of rows read per chunk; may be overridden by
        the "load.chunk" configuration entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Configuration entry "load.chunk" overrides the chunk size argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): a falsy chunksize silently skips the whole load
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The SQL references the local DataFrame by its variable
                    # name "chunk" (presumably DuckDB's pandas replacement
                    # scan) — do not rename that variable
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: append the chunk through pandas
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        Read the input file and load it into the variants table.

        Depending on the connexion format, the data is loaded either through
        DuckDB (via the `Database` helper, as a view in read-only mode or a
        table otherwise) or through SQLite (vcf/tsv/csv/psv files only,
        loaded in chunks).

        :param input_file: The path to the input file to load; when provided
        it replaces the object's current input and the header is re-read
        :type input_file: str
        :param drop_variants_table: When True, the variants table is dropped
        before loading the data, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: Number of rows to sample (passed to
        `Database.get_sql_from`); a falsy value is converted to -1, defaults
        to 20480
        :type sample_size: int (optional)
        :raises ValueError: when the input format cannot be loaded with the
        current connexion format
        """

        log.info("Loading...")

        # Switch to the provided input file and re-read its header
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # Drop the variants table first when requested
        if drop_variants_table:
            self.drop_variants_table()

        # Target table for the load
        table_variants = self.get_table_variants()

        # Access mode from configuration ("RO" means read-only)
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compression status
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # Compression format label (only used for logging below)
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format ("duckdb" or "sqlite"; raises otherwise)
        connexion_format = self.get_connexion_format()

        # Sample size (falsy value means "no limit", encoded as -1)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # The input is already a DuckDB database: nothing to load
            if self.input_format in ["db", "duckdb"]:

                # NOTE(review): this branch is only reached when
                # connexion_format is "duckdb", so the else below is dead code
                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from another file format through the Database helper
            else:

                try:
                    # Create a view (read-only access) or a table from the
                    # SQL source computed by the Database helper
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # NOTE(review): bare except hides the original error (and
                    # catches KeyboardInterrupt/SystemExit); consider
                    # "except Exception as e" with "raise ... from e"
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion (delimited text formats only)
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure: mandatory VCF columns
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples. NOTE(review): structure_complete
            # aliases structure (same dict object), so both names are
            # mutated below.
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Column definitions for CREATE, quoted column names for INSERT
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create the variants table if it does not exist yet
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize defines the number of file rows loaded per chunk
            chunksize = 100000

            # Column delimiter for the input format (tab by default)
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format.
                # NOTE(review): the bgzf handle rebinds input_file, so only
                # the plain handle is closed by the "with" block — the bgzf
                # handle is never explicitly closed.
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                if input_format in ["vcf"]:
                    # Skip the VCF header lines (all but the #CHROM line)
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into the variants table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFO fields into dedicated table columns when requested
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create indexes after insertion
        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1385    def get_explode_infos(self) -> bool:
1386        """
1387        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1388        to False if it is not set.
1389        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1390        value. If the parameter is not present, it will return False.
1391        """
1392
1393        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1395    def get_explode_infos_fields(
1396        self,
1397        explode_infos_fields: str = None,
1398        remove_fields_not_in_header: bool = False,
1399    ) -> list:
1400        """
1401        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1402        the input parameter `explode_infos_fields`.
1403
1404        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1405        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1406        comma-separated list of field names to explode
1407        :type explode_infos_fields: str
1408        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1409        flag that determines whether to remove fields that are not present in the header. If it is set
1410        to `True`, any field that is not in the header will be excluded from the list of exploded
1411        information fields. If it is set to `, defaults to False
1412        :type remove_fields_not_in_header: bool (optional)
1413        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1414        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1415        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1416        Otherwise, it returns a list of exploded information fields after removing any spaces and
1417        splitting the string by commas.
1418        """
1419
1420        # If no fields, get it in param
1421        if not explode_infos_fields:
1422            explode_infos_fields = (
1423                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1424            )
1425
1426        # If no fields, defined as all fields in header using keyword
1427        if not explode_infos_fields:
1428            explode_infos_fields = "*"
1429
1430        # If fields list not empty
1431        if explode_infos_fields:
1432
1433            # Input fields list
1434            if isinstance(explode_infos_fields, str):
1435                fields_input = explode_infos_fields.split(",")
1436            elif isinstance(explode_infos_fields, list):
1437                fields_input = explode_infos_fields
1438            else:
1439                fields_input = []
1440
1441            # Fields list without * keyword
1442            fields_without_all = fields_input.copy()
1443            if "*".casefold() in (item.casefold() for item in fields_without_all):
1444                fields_without_all.remove("*")
1445
1446            # Fields in header
1447            fields_in_header = sorted(list(set(self.get_header().infos)))
1448
1449            # Construct list of fields
1450            fields_output = []
1451            for field in fields_input:
1452
1453                # Strip field
1454                field = field.strip()
1455
1456                # format keyword * in regex
1457                if field.upper() in ["*"]:
1458                    field = ".*"
1459
1460                # Find all fields with pattern
1461                r = re.compile(field)
1462                fields_search = sorted(list(filter(r.match, fields_in_header)))
1463
1464                # Remove fields input from search
1465                if field in fields_search:
1466                    fields_search = [field]
1467                elif fields_search != [field]:
1468                    fields_search = sorted(
1469                        list(set(fields_search).difference(fields_input))
1470                    )
1471
1472                # If field is not in header (avoid not well formatted header)
1473                if not fields_search and not remove_fields_not_in_header:
1474                    fields_search = [field]
1475
1476                # Add found fields
1477                for new_field in fields_search:
1478                    # Add field, if not already exists, and if it is in header (if asked)
1479                    if (
1480                        new_field not in fields_output
1481                        and (
1482                            not remove_fields_not_in_header
1483                            or new_field in fields_in_header
1484                        )
1485                        and new_field not in [".*"]
1486                    ):
1487                        fields_output.append(new_field)
1488
1489            return fields_output
1490
1491        else:
1492
1493            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can be a comma-separated list of field names or regex patterns to explode
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header will be excluded from the list of exploded information fields. If it is set to False, such fields are kept. Defaults to False
Returns

The function get_explode_infos_fields returns a list of exploded information fields. If the explode_infos_fields parameter is not provided, all fields in the header are returned (the default pattern "*" matches every field). Otherwise, it returns the resolved field names after removing any spaces and splitting the string by commas.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1495    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1496        """
1497        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1498        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1499        not provided.
1500
1501        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1502        prefix to be used for exploding or expanding information
1503        :type explode_infos_prefix: str
1504        :return: the value of the variable `explode_infos_prefix`.
1505        """
1506
1507        if not explode_infos_prefix:
1508            explode_infos_prefix = (
1509                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1510            )
1511
1512        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
1514    def add_column(
1515        self,
1516        table_name,
1517        column_name,
1518        column_type,
1519        default_value=None,
1520        drop: bool = False,
1521    ) -> dict:
1522        """
1523        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
1524        doesn't already exist.
1525
1526        :param table_name: The name of the table to which you want to add a column
1527        :param column_name: The parameter "column_name" is the name of the column that you want to add
1528        to the table
1529        :param column_type: The `column_type` parameter specifies the data type of the column that you
1530        want to add to the table. It should be a string that represents the desired data type, such as
1531        "INTEGER", "TEXT", "REAL", etc
1532        :param default_value: The `default_value` parameter is an optional parameter that specifies the
1533        default value for the newly added column. If a default value is provided, it will be assigned to
1534        the column for any existing rows that do not have a value for that column
1535        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
1536        if it already exists in the table. If `drop` is set to `True`, the function will drop the
1537        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
1538        to False
1539        :type drop: bool (optional)
1540        :return: a boolean value indicating whether the column was successfully added to the table.
1541        """
1542
1543        # added
1544        added = False
1545        dropped = False
1546
1547        # Check if the column already exists in the table
1548        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1549        columns = self.get_query_to_df(query).columns.tolist()
1550        if column_name.upper() in [c.upper() for c in columns]:
1551            log.debug(
1552                f"The {column_name} column already exists in the {table_name} table"
1553            )
1554            if drop:
1555                self.drop_column(table_name=table_name, column_name=column_name)
1556                dropped = True
1557            else:
1558                return None
1559        else:
1560            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1561
1562        # Add column in table
1563        add_column_query = (
1564            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
1565        )
1566        if default_value is not None:
1567            add_column_query += f" DEFAULT {default_value}"
1568        self.execute_query(add_column_query)
1569        added = not dropped
1570        log.debug(
1571            f"The {column_name} column was successfully added to the {table_name} table"
1572        )
1573
1574        if added:
1575            added_column = {
1576                "table_name": table_name,
1577                "column_name": column_name,
1578                "column_type": column_type,
1579                "default_value": default_value,
1580            }
1581        else:
1582            added_column = None
1583
1584        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column; if set to False, an existing column is left untouched. Defaults to False
Returns

a dictionary describing the added column (table name, column name, type and default value), or None when the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1586    def drop_column(
1587        self, column: dict = None, table_name: str = None, column_name: str = None
1588    ) -> bool:
1589        """
1590        The `drop_column` function drops a specified column from a given table in a database and returns
1591        True if the column was successfully dropped, and False if the column does not exist in the
1592        table.
1593
1594        :param column: The `column` parameter is a dictionary that contains information about the column
1595        you want to drop. It has two keys:
1596        :type column: dict
1597        :param table_name: The `table_name` parameter is the name of the table from which you want to
1598        drop a column
1599        :type table_name: str
1600        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1601        from the table
1602        :type column_name: str
1603        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1604        and False if the column does not exist in the table.
1605        """
1606
1607        # Find column infos
1608        if column:
1609            if isinstance(column, dict):
1610                table_name = column.get("table_name", None)
1611                column_name = column.get("column_name", None)
1612            elif isinstance(column, str):
1613                table_name = self.get_table_variants()
1614                column_name = column
1615            else:
1616                table_name = None
1617                column_name = None
1618
1619        if not table_name and not column_name:
1620            return False
1621
1622        # Removed
1623        removed = False
1624
1625        # Check if the column already exists in the table
1626        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1627        columns = self.get_query_to_df(query).columns.tolist()
1628        if column_name in columns:
1629            log.debug(f"The {column_name} column exists in the {table_name} table")
1630        else:
1631            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1632            return False
1633
1634        # Add column in table # ALTER TABLE integers DROP k
1635        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1636        self.execute_query(add_column_query)
1637        removed = True
1638        log.debug(
1639            f"The {column_name} column was successfully dropped to the {table_name} table"
1640        )
1641
1642        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: table_name and column_name
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
1644    def explode_infos(
1645        self,
1646        prefix: str = None,
1647        create_index: bool = False,
1648        fields: list = None,
1649        force: bool = False,
1650        proccess_all_fields_together: bool = False,
1651        table: str = None,
1652    ) -> list:
1653        """
1654        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
1655        individual columns, returning a list of added columns.
1656
1657        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1658        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1659        `self.get_explode_infos_prefix()` as the prefix
1660        :type prefix: str
1661        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1662        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1663        `False`, indexes will not be created. The default value is `False`, defaults to False
1664        :type create_index: bool (optional)
1665        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
1666        that you want to explode into individual columns. If this parameter is not provided, all INFO
1667        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
1668        a list to the `
1669        :type fields: list
1670        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
1671        determines whether to drop and recreate a column if it already exists in the table. If `force`
1672        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
1673        defaults to False
1674        :type force: bool (optional)
1675        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1676        flag that determines whether to process all the INFO fields together or individually. If set to
1677        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1678        be processed individually. The default value is, defaults to False
1679        :type proccess_all_fields_together: bool (optional)
1680        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
1681        of the table where the exploded INFO fields will be added as individual columns. If you provide
1682        a value for the `table` parameter, the function will use that table name. If the `table`
1683        parameter is
1684        :type table: str
1685        :return: The `explode_infos` function returns a list of added columns.
1686        """
1687
1688        # drop indexes
1689        self.drop_indexes()
1690
1691        # connexion format
1692        connexion_format = self.get_connexion_format()
1693
1694        # Access
1695        access = self.get_config().get("access", None)
1696
1697        # Added columns
1698        added_columns = []
1699
1700        if access not in ["RO"]:
1701
1702            # prefix
1703            if prefix in [None, True] or not isinstance(prefix, str):
1704                if self.get_explode_infos_prefix() not in [None, True]:
1705                    prefix = self.get_explode_infos_prefix()
1706                else:
1707                    prefix = "INFO/"
1708
1709            # table variants
1710            if table is not None:
1711                table_variants = table
1712            else:
1713                table_variants = self.get_table_variants(clause="select")
1714
1715            # extra infos
1716            try:
1717                extra_infos = self.get_extra_infos()
1718            except:
1719                extra_infos = []
1720
1721            # Header infos
1722            header_infos = self.get_header().infos
1723
1724            log.debug(
1725                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1726            )
1727
1728            sql_info_alter_table_array = []
1729
1730            # Info fields to check
1731            fields_list = list(header_infos)
1732            if fields:
1733                fields_list += fields
1734            fields_list = set(fields_list)
1735
1736            # If no fields
1737            if not fields:
1738                fields = []
1739
1740            # Translate fields if patterns
1741            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1742
1743            for info in fields:
1744
1745                info_id_sql = prefix + info
1746
1747                if (
1748                    info in fields_list
1749                    or prefix + info in fields_list
1750                    or info in extra_infos
1751                ):
1752
1753                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1754
1755                    if info in header_infos:
1756                        info_type = header_infos[info].type
1757                        info_num = header_infos[info].num
1758                    else:
1759                        info_type = "String"
1760                        info_num = 0
1761
1762                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1763                    if info_num != 1:
1764                        type_sql = "VARCHAR"
1765
1766                    # Add field
1767                    added_column = self.add_column(
1768                        table_name=table_variants,
1769                        column_name=info_id_sql,
1770                        column_type=type_sql,
1771                        default_value="null",
1772                        drop=force,
1773                    )
1774
1775                    if added_column:
1776                        added_columns.append(added_column)
1777
1778                    if added_column or force:
1779
1780                        # add field to index
1781                        self.index_additionnal_fields.append(info_id_sql)
1782
1783                        # Update field array
1784                        if connexion_format in ["duckdb"]:
1785                            update_info_field = f"""
1786                            "{info_id_sql}" =
1787                                CASE
1788                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1789                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1790                                END
1791                            """
1792                        elif connexion_format in ["sqlite"]:
1793                            update_info_field = f"""
1794                                "{info_id_sql}" =
1795                                    CASE
1796                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1797                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1798                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1799                                    END
1800                            """
1801
1802                        sql_info_alter_table_array.append(update_info_field)
1803
1804            if sql_info_alter_table_array:
1805
1806                # By chromosomes
1807                try:
1808                    chromosomes_list = list(
1809                        self.get_query_to_df(
1810                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1811                        )["#CHROM"]
1812                    )
1813                except:
1814                    chromosomes_list = [None]
1815
1816                for chrom in chromosomes_list:
1817                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1818
1819                    # Where clause
1820                    where_clause = ""
1821                    if chrom and len(chromosomes_list) > 1:
1822                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1823
1824                    # Update table
1825                    if proccess_all_fields_together:
1826                        sql_info_alter_table_array_join = ", ".join(
1827                            sql_info_alter_table_array
1828                        )
1829                        if sql_info_alter_table_array_join:
1830                            sql_info_alter_table = f"""
1831                                UPDATE {table_variants}
1832                                SET {sql_info_alter_table_array_join}
1833                                {where_clause}
1834                                """
1835                            log.debug(
1836                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1837                            )
1838                            # log.debug(sql_info_alter_table)
1839                            self.conn.execute(sql_info_alter_table)
1840                    else:
1841                        sql_info_alter_num = 0
1842                        for sql_info_alter in sql_info_alter_table_array:
1843                            sql_info_alter_num += 1
1844                            sql_info_alter_table = f"""
1845                                UPDATE {table_variants}
1846                                SET {sql_info_alter}
1847                                {where_clause}
1848                                """
1849                            log.debug(
1850                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1851                            )
1852                            # log.debug(sql_info_alter_table)
1853                            self.conn.execute(sql_info_alter_table)
1854
1855        # create indexes
1856        if create_index:
1857            self.create_indexes()
1858
1859        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if set to False, existing columns are left as they are. Defaults to False
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together. If set to False, each INFO field will be processed individually. The default value is, defaults to False
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1861    def create_indexes(self) -> None:
1862        """
1863        Create indexes on the table after insertion
1864        """
1865
1866        # Access
1867        access = self.get_config().get("access", None)
1868
1869        # get table variants
1870        table_variants = self.get_table_variants("FROM")
1871
1872        if self.get_indexing() and access not in ["RO"]:
1873            # Create index
1874            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1875            self.conn.execute(sql_create_table_index)
1876            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1877            self.conn.execute(sql_create_table_index)
1878            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1879            self.conn.execute(sql_create_table_index)
1880            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1881            self.conn.execute(sql_create_table_index)
1882            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1883            self.conn.execute(sql_create_table_index)
1884            for field in self.index_additionnal_fields:
1885                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1886                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1888    def drop_indexes(self) -> None:
1889        """
1890        Create indexes on the table after insertion
1891        """
1892
1893        # Access
1894        access = self.get_config().get("access", None)
1895
1896        # get table variants
1897        table_variants = self.get_table_variants("FROM")
1898
1899        # Get database format
1900        connexion_format = self.get_connexion_format()
1901
1902        if access not in ["RO"]:
1903            if connexion_format in ["duckdb"]:
1904                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
1905            elif connexion_format in ["sqlite"]:
1906                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
1907
1908            list_indexes = self.conn.execute(sql_list_indexes)
1909            index_names = [row[0] for row in list_indexes.fetchall()]
1910            for index in index_names:
1911                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
1912                self.conn.execute(sql_drop_table_index)

Drop existing indexes on the variants table.

def read_vcf_header(self, f) -> list:
1914    def read_vcf_header(self, f) -> list:
1915        """
1916        It reads the header of a VCF file and returns a list of the header lines
1917
1918        :param f: the file object
1919        :return: The header lines of the VCF file.
1920        """
1921
1922        header_list = []
1923        for line in f:
1924            header_list.append(line)
1925            if line.startswith("#CHROM"):
1926                break
1927        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
1929    def read_vcf_header_file(self, file: str = None) -> list:
1930        """
1931        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
1932        uncompressed files.
1933
1934        :param file: The `file` parameter is a string that represents the path to the VCF header file
1935        that you want to read. It is an optional parameter, so if you don't provide a value, it will
1936        default to `None`
1937        :type file: str
1938        :return: The function `read_vcf_header_file` returns a list.
1939        """
1940
1941        if self.get_input_compressed(input_file=file):
1942            with bgzf.open(file, "rt") as f:
1943                return self.read_vcf_header(f=f)
1944        else:
1945            with open(file, "rt") as f:
1946                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
1948    def execute_query(self, query: str):
1949        """
1950        It takes a query as an argument, executes it, and returns the results
1951
1952        :param query: The query to be executed
1953        :return: The result of the query is being returned.
1954        """
1955        if query:
1956            return self.conn.execute(query)  # .fetchall()
1957        else:
1958            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
1960    def export_output(
1961        self,
1962        output_file: str | None = None,
1963        output_header: str | None = None,
1964        export_header: bool = True,
1965        query: str | None = None,
1966        parquet_partitions: list | None = None,
1967        chunk_size: int | None = None,
1968        threads: int | None = None,
1969        sort: bool = False,
1970        index: bool = False,
1971        order_by: str | None = None,
1972    ) -> bool:
1973        """
1974        The `export_output` function exports data from a VCF file to a specified output file in various
1975        formats, including VCF, CSV, TSV, PSV, and Parquet.
1976
1977        :param output_file: The `output_file` parameter is a string that specifies the name of the
1978        output file to be generated by the function. This is where the exported data will be saved
1979        :type output_file: str
1980        :param output_header: The `output_header` parameter is a string that specifies the name of the
1981        file where the header of the VCF file will be exported. If this parameter is not provided, the
1982        header will be exported to a file with the same name as the `output_file` parameter, but with
1983        the extension "
1984        :type output_header: str
1985        :param export_header: The `export_header` parameter is a boolean flag that determines whether
1986        the header of a VCF file should be exported to a separate file or not. If `export_header` is
1987        True, the header will be exported to a file. If `export_header` is False, the header will not
1988        be, defaults to True, if output format is not VCF
1989        :type export_header: bool (optional)
1990        :param query: The `query` parameter is an optional SQL query that can be used to filter and
1991        select specific data from the VCF file before exporting it. If provided, only the data that
1992        matches the query will be exported
1993        :type query: str
1994        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
1995        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
1996        organize data in a hierarchical directory structure based on the values of one or more columns.
1997        This can improve query performance when working with large datasets
1998        :type parquet_partitions: list
1999        :param chunk_size: The `chunk_size` parameter specifies the number of
2000        records in batch when exporting data in Parquet format. This parameter is used for
2001        partitioning the Parquet file into multiple files.
2002        :type chunk_size: int
2003        :param threads: The `threads` parameter is an optional parameter that specifies the number of
2004        threads to be used during the export process. It determines the level of parallelism and can
2005        improve the performance of the export operation. If not provided, the function will use the
2006        default number of threads
2007        :type threads: int
2008        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
2009        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
2010        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
2011        False
2012        :type sort: bool (optional)
2013        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2014        created on the output file. If `index` is True, an index will be created. If `index` is False,
2015        no index will be created. The default value is False, defaults to False
2016        :type index: bool (optional)
2017        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2018        sorting the output file. This parameter is only applicable when exporting data in VCF format
2019        :type order_by: str
2020        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2021        None if it doesn't.
2022        """
2023
2024        # Log
2025        log.info("Exporting...")
2026
2027        # Full path
2028        output_file = full_path(output_file)
2029        output_header = full_path(output_header)
2030
2031        # Config
2032        config = self.get_config()
2033
2034        # Param
2035        param = self.get_param()
2036
2037        # Tmp files to remove
2038        tmp_to_remove = []
2039
2040        # If no output, get it
2041        if not output_file:
2042            output_file = self.get_output()
2043
2044        # If not threads
2045        if not threads:
2046            threads = self.get_threads()
2047
2048        # Auto header name with extension
2049        if export_header or output_header:
2050            if not output_header:
2051                output_header = f"{output_file}.hdr"
2052            # Export header
2053            self.export_header(output_file=output_file)
2054
2055        # Switch off export header if VCF output
2056        output_file_type = get_file_format(output_file)
2057        if output_file_type in ["vcf"]:
2058            export_header = False
2059            tmp_to_remove.append(output_header)
2060
2061        # Chunk size
2062        if not chunk_size:
2063            chunk_size = config.get("chunk_size", None)
2064
2065        # Parquet partition
2066        if not parquet_partitions:
2067            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2068        if parquet_partitions and isinstance(parquet_partitions, str):
2069            parquet_partitions = parquet_partitions.split(",")
2070
2071        # Order by
2072        if not order_by:
2073            order_by = param.get("export", {}).get("order_by", "")
2074
2075        # Header in output
2076        header_in_output = param.get("export", {}).get("include_header", False)
2077
2078        # Database
2079        database_source = self.get_connexion()
2080
2081        # Connexion format
2082        connexion_format = self.get_connexion_format()
2083
2084        # Explode infos
2085        if self.get_explode_infos():
2086            self.explode_infos(
2087                prefix=self.get_explode_infos_prefix(),
2088                fields=self.get_explode_infos_fields(),
2089                force=False,
2090            )
2091
2092        # if connexion_format in ["sqlite"] or query:
2093        if connexion_format in ["sqlite"]:
2094
2095            # Export in Parquet
2096            random_tmp = "".join(
2097                random.choice(string.ascii_lowercase) for i in range(10)
2098            )
2099            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2100            tmp_to_remove.append(database_source)
2101
2102            # Table Variants
2103            table_variants = self.get_table_variants()
2104
2105            # Create export query
2106            sql_query_export_subquery = f"""
2107                SELECT * FROM {table_variants}
2108                """
2109
2110            # Write source file
2111            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2112
2113        # Create database
2114        database = Database(
2115            database=database_source,
2116            table="variants",
2117            header_file=output_header,
2118            conn_config=self.get_connexion_config(),
2119        )
2120
2121        # Existing colomns header
2122        # existing_columns_header = database.get_header_file_columns(output_header)
2123        existing_columns_header = database.get_header_columns_from_database()
2124
2125        # Export file
2126        database.export(
2127            output_database=output_file,
2128            output_header=output_header,
2129            existing_columns_header=existing_columns_header,
2130            parquet_partitions=parquet_partitions,
2131            chunk_size=chunk_size,
2132            threads=threads,
2133            sort=sort,
2134            index=index,
2135            header_in_output=header_in_output,
2136            order_by=order_by,
2137            query=query,
2138            export_header=export_header,
2139        )
2140
2141        # Remove
2142        remove_if_exists(tmp_to_remove)
2143
2144        return (os.path.exists(output_file) or None) and (
2145            os.path.exists(output_file) or None
2146        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr"
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True, if output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2148    def get_extra_infos(self, table: str = None) -> list:
2149        """
2150        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2151        in the header.
2152
2153        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2154        name of the table from which you want to retrieve the extra columns that are not present in the
2155        header. If the `table` parameter is not provided when calling the function, it will default to
2156        using the variants
2157        :type table: str
2158        :return: A list of columns that are in the specified table but not in the header of the table.
2159        """
2160
2161        header_columns = []
2162
2163        if not table:
2164            table = self.get_table_variants(clause="from")
2165            header_columns = self.get_header_columns()
2166
2167        # Check all columns in the database
2168        query = f""" SELECT * FROM {table} LIMIT 1 """
2169        log.debug(f"query {query}")
2170        table_columns = self.get_query_to_df(query).columns.tolist()
2171        extra_columns = []
2172
2173        # Construct extra infos (not in header)
2174        for column in table_columns:
2175            if column not in header_columns:
2176                extra_columns.append(column)
2177
2178        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2180    def get_extra_infos_sql(self, table: str = None) -> str:
2181        """
2182        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2183        by double quotes
2184
2185        :param table: The name of the table to get the extra infos from. If None, the default table is
2186        used
2187        :type table: str
2188        :return: A string of the extra infos
2189        """
2190
2191        return ", ".join(
2192            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2193        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2195    def export_header(
2196        self,
2197        header_name: str = None,
2198        output_file: str = None,
2199        output_file_ext: str = ".hdr",
2200        clean_header: bool = True,
2201        remove_chrom_line: bool = False,
2202    ) -> str:
2203        """
2204        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2205        specified options, and writes it to a new file.
2206
2207        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2208        this parameter is not specified, the header will be written to the output file
2209        :type header_name: str
2210        :param output_file: The `output_file` parameter in the `export_header` function is used to
2211        specify the name of the output file where the header will be written. If this parameter is not
2212        provided, the header will be written to a temporary file
2213        :type output_file: str
2214        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2215        string that represents the extension of the output header file. By default, it is set to ".hdr"
2216        if not specified by the user. This extension will be appended to the `output_file` name to
2217        create the final, defaults to .hdr
2218        :type output_file_ext: str (optional)
2219        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2220        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2221        `True`, the function will clean the header by modifying certain lines based on a specific
2222        pattern. If `clean_header`, defaults to True
2223        :type clean_header: bool (optional)
2224        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2225        boolean flag that determines whether the #CHROM line should be removed from the header before
2226        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2227        defaults to False
2228        :type remove_chrom_line: bool (optional)
2229        :return: The function `export_header` returns the name of the temporary header file that is
2230        created.
2231        """
2232
2233        if not header_name and not output_file:
2234            output_file = self.get_output()
2235
2236        if self.get_header():
2237
2238            # Get header object
2239            header_obj = self.get_header()
2240
2241            # Create database
2242            db_for_header = Database(database=self.get_input())
2243
2244            # Get real columns in the file
2245            db_header_columns = db_for_header.get_columns()
2246
2247            with tempfile.TemporaryDirectory() as tmpdir:
2248
2249                # Write header file
2250                header_file_tmp = os.path.join(tmpdir, "header")
2251                f = open(header_file_tmp, "w")
2252                vcf.Writer(f, header_obj)
2253                f.close()
2254
2255                # Replace #CHROM line with rel columns
2256                header_list = db_for_header.read_header_file(
2257                    header_file=header_file_tmp
2258                )
2259                header_list[-1] = "\t".join(db_header_columns)
2260
2261                # Remove CHROM line
2262                if remove_chrom_line:
2263                    header_list.pop()
2264
2265                # Clean header
2266                if clean_header:
2267                    header_list_clean = []
2268                    for head in header_list:
2269                        # Clean head for malformed header
2270                        head_clean = head
2271                        head_clean = re.subn(
2272                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2273                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2274                            head_clean,
2275                            2,
2276                        )[0]
2277                        # Write header
2278                        header_list_clean.append(head_clean)
2279                    header_list = header_list_clean
2280
2281            tmp_header_name = output_file + output_file_ext
2282
2283            f = open(tmp_header_name, "w")
2284            for line in header_list:
2285                f.write(line)
2286            f.close()
2287
2288        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2290    def export_variant_vcf(
2291        self,
2292        vcf_file,
2293        remove_info: bool = False,
2294        add_samples: bool = True,
2295        list_samples: list = [],
2296        where_clause: str = "",
2297        index: bool = False,
2298        threads: int | None = None,
2299    ) -> bool | None:
2300        """
2301        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2302        remove INFO field, add samples, and control compression and indexing.
2303
2304        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2305        written to. It is the output file that will contain the filtered VCF data based on the specified
2306        parameters
2307        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2308        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2309        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2310        in, defaults to False
2311        :type remove_info: bool (optional)
2312        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2313        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2314        If set to False, the samples will be removed. The default value is True, defaults to True
2315        :type add_samples: bool (optional)
2316        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2317        in the output VCF file. By default, all samples will be included. If you provide a list of
2318        samples, only those samples will be included in the output file
2319        :type list_samples: list
2320        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2321        determines whether or not to create an index for the output VCF file. If `index` is set to
2322        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2323        :type index: bool (optional)
2324        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2325        number of threads to use for exporting the VCF file. It determines how many parallel threads
2326        will be used during the export process. More threads can potentially speed up the export process
2327        by utilizing multiple cores of the processor. If
2328        :type threads: int | None
2329        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2330        method with various parameters including the output file, query, threads, sort flag, and index
2331        flag. The `export_output` method is responsible for exporting the VCF data based on the
2332        specified parameters and configurations provided in the `export_variant_vcf` function.
2333        """
2334
2335        # Config
2336        config = self.get_config()
2337
2338        # Extract VCF
2339        log.debug("Export VCF...")
2340
2341        # Table variants
2342        table_variants = self.get_table_variants()
2343
2344        # Threads
2345        if not threads:
2346            threads = self.get_threads()
2347
2348        # Info fields
2349        if remove_info:
2350            if not isinstance(remove_info, str):
2351                remove_info = "."
2352            info_field = f"""'{remove_info}' as INFO"""
2353        else:
2354            info_field = "INFO"
2355
2356        # Samples fields
2357        if add_samples:
2358            if not list_samples:
2359                list_samples = self.get_header_sample_list()
2360            if list_samples:
2361                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2362            else:
2363                samples_fields = ""
2364            log.debug(f"samples_fields: {samples_fields}")
2365        else:
2366            samples_fields = ""
2367
2368        # Where clause
2369        if where_clause is None:
2370            where_clause = ""
2371
2372        # Variants
2373        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2374        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2375        log.debug(f"sql_query_select={sql_query_select}")
2376
2377        return self.export_output(
2378            output_file=vcf_file,
2379            output_header=None,
2380            export_header=True,
2381            query=sql_query_select,
2382            parquet_partitions=None,
2383            chunk_size=config.get("chunk_size", None),
2384            threads=threads,
2385            sort=True,
2386            index=index,
2387            order_by=None,
2388        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor.
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2390    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2391        """
2392        It takes a list of commands and runs them in parallel using the number of threads specified
2393
2394        :param commands: A list of commands to run
2395        :param threads: The number of threads to use, defaults to 1 (optional)
2396        """
2397
2398        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2400    def get_threads(self, default: int = 1) -> int:
2401        """
2402        This function returns the number of threads to use for a job, with a default value of 1 if not
2403        specified.
2404
2405        :param default: The `default` parameter in the `get_threads` method is used to specify the
2406        default number of threads to use if no specific value is provided. If no value is provided for
2407        the `threads` parameter in the configuration or input parameters, the `default` value will be
2408        used, defaults to 1
2409        :type default: int (optional)
2410        :return: the number of threads to use for the current job.
2411        """
2412
2413        # Config
2414        config = self.get_config()
2415
2416        # Param
2417        param = self.get_param()
2418
2419        # Input threads
2420        input_thread = param.get("threads", config.get("threads", None))
2421
2422        # Check threads
2423        if not input_thread:
2424            threads = default
2425        elif int(input_thread) <= 0:
2426            threads = os.cpu_count()
2427        else:
2428            threads = int(input_thread)
2429        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2431    def get_memory(self, default: str = None) -> str:
2432        """
2433        This function retrieves the memory value from parameters or configuration with a default value
2434        if not found.
2435
2436        :param default: The `get_memory` function takes in a default value as a string parameter. This
2437        default value is used as a fallback in case the `memory` parameter is not provided in the
2438        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2439        the function
2440        :type default: str
2441        :return: The `get_memory` function returns a string value representing the memory parameter. If
2442        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2443        return the default value provided as an argument to the function.
2444        """
2445
2446        # Config
2447        config = self.get_config()
2448
2449        # Param
2450        param = self.get_param()
2451
2452        # Input threads
2453        input_memory = param.get("memory", config.get("memory", None))
2454
2455        # Check threads
2456        if input_memory:
2457            memory = input_memory
2458        else:
2459            memory = default
2460
2461        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: A fallback memory value, as a string. It is used when the memory parameter is not provided in either the param dictionary or the config dictionary.
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2463    def update_from_vcf(self, vcf_file: str) -> None:
2464        """
2465        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2466
2467        :param vcf_file: the path to the VCF file
2468        """
2469
2470        connexion_format = self.get_connexion_format()
2471
2472        if connexion_format in ["duckdb"]:
2473            self.update_from_vcf_duckdb(vcf_file)
2474        elif connexion_format in ["sqlite"]:
2475            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file to the INFO column of the variants
        table, matching rows on #CHROM/POS/REF/ALT.

        When both the existing INFO and the incoming VCF INFO are non-empty
        (not '' or '.'), they are joined with a ';' separator; empty values on
        either side are skipped.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the meta-header lines.
        # Assumes get_header_length() returns the number of lines to skip so
        # that the '#CHROM' line is read as the column header — TODO confirm.
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: the query below reads the local DataFrame 'vcf_df' by name via
        # duckdb's pandas replacement scan — the variable name must match the
        # table name used in the FROM clause, so do not rename it.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2533    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
2534        """
2535        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
2536        table, then updates the INFO column of the variants table with the INFO column of the temporary
2537        table
2538
2539        :param vcf_file: The path to the VCF file you want to update the database with
2540        """
2541
2542        # Create a temporary table for the VCF
2543        table_vcf = "tmp_vcf"
2544        sql_create = (
2545            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
2546        )
2547        self.conn.execute(sql_create)
2548
2549        # Loading VCF into temporaire table
2550        vcf_df = pd.read_csv(
2551            vcf_file, sep="\t", comment="#", header=None, low_memory=False
2552        )
2553        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
2554        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
2555
2556        # Update table 'variants' with VCF data
2557        # warning: CONCAT as || operator
2558        sql_query_update = f"""
2559            UPDATE variants as table_variants
2560            SET INFO = CASE
2561                            WHEN INFO NOT IN ('', '.')
2562                            THEN INFO
2563                            ELSE ''
2564                        END ||
2565                        (
2566                        SELECT 
2567                            CASE 
2568                                WHEN table_variants.INFO NOT IN ('','.') 
2569                                    AND table_vcf.INFO NOT IN ('','.')  
2570                                THEN ';' 
2571                                ELSE '' 
2572                            END || 
2573                            CASE 
2574                                WHEN table_vcf.INFO NOT IN ('','.') 
2575                                THEN table_vcf.INFO 
2576                                ELSE '' 
2577                            END
2578                        FROM {table_vcf} as table_vcf
2579                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
2580                            AND table_vcf.\"POS\" = table_variants.\"POS\"
2581                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
2582                            AND table_vcf.\"REF\" = table_variants.\"REF\"
2583                        )
2584        """
2585        self.conn.execute(sql_query_update)
2586
2587        # Drop temporary table
2588        sql_drop = f"DROP TABLE {table_vcf}"
2589        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2591    def drop_variants_table(self) -> None:
2592        """
2593        > This function drops the variants table
2594        """
2595
2596        table_variants = self.get_table_variants()
2597        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2598        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2600    def set_variant_id(
2601        self, variant_id_column: str = "variant_id", force: bool = None
2602    ) -> str:
2603        """
2604        It adds a column to the variants table called `variant_id` and populates it with a hash of the
2605        `#CHROM`, `POS`, `REF`, and `ALT` columns
2606
2607        :param variant_id_column: The name of the column to be created in the variants table, defaults
2608        to variant_id
2609        :type variant_id_column: str (optional)
2610        :param force: If True, the variant_id column will be created even if it already exists
2611        :type force: bool
2612        :return: The name of the column that contains the variant_id
2613        """
2614
2615        # Assembly
2616        assembly = self.get_param().get(
2617            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
2618        )
2619
2620        # INFO/Tag prefix
2621        prefix = self.get_explode_infos_prefix()
2622
2623        # Explode INFO/SVTYPE
2624        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
2625
2626        # variants table
2627        table_variants = self.get_table_variants()
2628
2629        # variant_id column
2630        if not variant_id_column:
2631            variant_id_column = "variant_id"
2632
2633        # Creta variant_id column
2634        if "variant_id" not in self.get_extra_infos() or force:
2635
2636            # Create column
2637            self.add_column(
2638                table_name=table_variants,
2639                column_name=variant_id_column,
2640                column_type="UBIGINT",
2641                default_value="0",
2642            )
2643
2644            # Update column
2645            self.conn.execute(
2646                f"""
2647                    UPDATE {table_variants}
2648                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
2649                """
2650            )
2651
2652        # Remove added columns
2653        for added_column in added_columns:
2654            self.drop_column(column=added_column)
2655
2656        # return variant_id column name
2657        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2659    def get_variant_id_column(
2660        self, variant_id_column: str = "variant_id", force: bool = None
2661    ) -> str:
2662        """
2663        This function returns the variant_id column name
2664
2665        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2666        defaults to variant_id
2667        :type variant_id_column: str (optional)
2668        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2669        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2670        if it is not already set, or if it is set
2671        :type force: bool
2672        :return: The variant_id column name.
2673        """
2674
2675        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, the variant_id column is recreated and repopulated even when it already exists. If False or None, the column is only created when it is missing.
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2681    def scan_databases(
2682        self,
2683        database_formats: list = ["parquet"],
2684        database_releases: list = ["current"],
2685    ) -> dict:
2686        """
2687        The function `scan_databases` scans for available databases based on specified formats and
2688        releases.
2689
2690        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2691        of the databases to be scanned. In this case, the accepted format is "parquet"
2692        :type database_formats: list ["parquet"]
2693        :param database_releases: The `database_releases` parameter is a list that specifies the
2694        releases of the databases to be scanned. In the provided function, the default value for
2695        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2696        databases that are in the "current"
2697        :type database_releases: list
2698        :return: The function `scan_databases` returns a dictionary containing information about
2699        databases that match the specified formats and releases.
2700        """
2701
2702        # Config
2703        config = self.get_config()
2704
2705        # Param
2706        param = self.get_param()
2707
2708        # Param - Assembly
2709        assembly = param.get("assembly", config.get("assembly", None))
2710        if not assembly:
2711            assembly = DEFAULT_ASSEMBLY
2712            log.warning(f"Default assembly '{assembly}'")
2713
2714        # Scan for availabled databases
2715        log.info(
2716            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2717        )
2718        databases_infos_dict = databases_infos(
2719            database_folder_releases=database_releases,
2720            database_formats=database_formats,
2721            assembly=assembly,
2722            config=config,
2723        )
2724        log.info(
2725            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2726        )
2727
2728        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. Its default value is ["current"], meaning that by default the function scans databases located in the "current" release folder.
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
    def annotation(self) -> None:
        """
        Annotate the VCF with the annotations specified in the parameters.

        The method first normalizes the per-tool quick-annotation parameters
        (annotation_parquet, annotation_snpsift, annotation_snpeff,
        annotation_bcftools, annotation_annovar, annotation_exomiser,
        annotation_splice) into the comma-separated 'annotations' parameter,
        resolves each annotation database file (expanding the 'ALL' keyword by
        scanning available databases), fills param["annotation"] per tool, and
        finally dispatches to the tool-specific annotation_* methods.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config, with a default fallback)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (annotations + parquet + bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold the per-tool shortcut parameters into the
        # single 'annotations' list, prefixing entries with the tool name
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list (param is mutated in place)
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters (string form → dict keyed by file)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL (scan for every available database)
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    # NOTE(review): the defaults below are bare strings while
                    # scan_databases expects lists — confirm intended handling
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection (strip the tool prefix from the entry)
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both separate files)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        # NOTE(review): the loop variable below intentionally
                        # shadows/reuses 'annotation_file' from the outer loop
                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    # .tbi presence is used as the "indexed" signal
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    bcftools_preference = False

                                    # Check Annotation Tool
                                    # (bcftools only when preferred AND the file is
                                    # a compressed+indexed vcf/bed; otherwise parquet
                                    # handles all supported formats)
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Dispatch to the tool-specific annotation methods
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3102    def annotation_snpsift(self, threads: int = None) -> None:
3103        """
3104        This function annotate with bcftools
3105
3106        :param threads: Number of threads to use
3107        :return: the value of the variable "return_value".
3108        """
3109
3110        # DEBUG
3111        log.debug("Start annotation with bcftools databases")
3112
3113        # Threads
3114        if not threads:
3115            threads = self.get_threads()
3116        log.debug("Threads: " + str(threads))
3117
3118        # Config
3119        config = self.get_config()
3120        log.debug("Config: " + str(config))
3121
3122        # Config - snpSift
3123        snpsift_bin_command = get_bin_command(
3124            bin="SnpSift.jar",
3125            tool="snpsift",
3126            bin_type="jar",
3127            config=config,
3128            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3129        )
3130        if not snpsift_bin_command:
3131            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3132            log.error(msg_err)
3133            raise ValueError(msg_err)
3134
3135        # Config - bcftools
3136        bcftools_bin_command = get_bin_command(
3137            bin="bcftools",
3138            tool="bcftools",
3139            bin_type="bin",
3140            config=config,
3141            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3142        )
3143        if not bcftools_bin_command:
3144            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3145            log.error(msg_err)
3146            raise ValueError(msg_err)
3147
3148        # Config - BCFTools databases folders
3149        databases_folders = set(
3150            self.get_config()
3151            .get("folders", {})
3152            .get("databases", {})
3153            .get("annotations", ["."])
3154            + self.get_config()
3155            .get("folders", {})
3156            .get("databases", {})
3157            .get("bcftools", ["."])
3158        )
3159        log.debug("Databases annotations: " + str(databases_folders))
3160
3161        # Param
3162        annotations = (
3163            self.get_param()
3164            .get("annotation", {})
3165            .get("snpsift", {})
3166            .get("annotations", None)
3167        )
3168        log.debug("Annotations: " + str(annotations))
3169
3170        # Assembly
3171        assembly = self.get_param().get(
3172            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3173        )
3174
3175        # Data
3176        table_variants = self.get_table_variants()
3177
3178        # Check if not empty
3179        log.debug("Check if not empty")
3180        sql_query_chromosomes = (
3181            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3182        )
3183        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3184        if not sql_query_chromosomes_df["count"][0]:
3185            log.info(f"VCF empty")
3186            return
3187
3188        # VCF header
3189        vcf_reader = self.get_header()
3190        log.debug("Initial header: " + str(vcf_reader.infos))
3191
3192        # Existing annotations
3193        for vcf_annotation in self.get_header().infos:
3194
3195            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3196            log.debug(
3197                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3198            )
3199
3200        if annotations:
3201
3202            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3203
3204                # Export VCF file
3205                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3206
3207                # Init
3208                commands = {}
3209
3210                for annotation in annotations:
3211                    annotation_fields = annotations[annotation]
3212
3213                    # Annotation Name
3214                    annotation_name = os.path.basename(annotation)
3215
3216                    if not annotation_fields:
3217                        annotation_fields = {"INFO": None}
3218
3219                    log.debug(f"Annotation '{annotation_name}'")
3220                    log.debug(
3221                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3222                    )
3223
3224                    # Create Database
3225                    database = Database(
3226                        database=annotation,
3227                        databases_folders=databases_folders,
3228                        assembly=assembly,
3229                    )
3230
3231                    # Find files
3232                    db_file = database.get_database()
3233                    db_file = full_path(db_file)
3234                    db_hdr_file = database.get_header_file()
3235                    db_hdr_file = full_path(db_hdr_file)
3236                    db_file_type = database.get_format()
3237                    db_tbi_file = f"{db_file}.tbi"
3238                    db_file_compressed = database.is_compressed()
3239
3240                    # Check if compressed
3241                    if not db_file_compressed:
3242                        log.error(
3243                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3244                        )
3245                        raise ValueError(
3246                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3247                        )
3248
3249                    # Check if indexed
3250                    if not os.path.exists(db_tbi_file):
3251                        log.error(
3252                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3253                        )
3254                        raise ValueError(
3255                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3256                        )
3257
3258                    # Check index - try to create if not exists
3259                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3260                        log.error("Annotation failed: database not valid")
3261                        log.error(f"Annotation annotation file: {db_file}")
3262                        log.error(f"Annotation annotation header: {db_hdr_file}")
3263                        log.error(f"Annotation annotation index: {db_tbi_file}")
3264                        raise ValueError(
3265                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3266                        )
3267                    else:
3268
3269                        log.debug(
3270                            f"Annotation '{annotation}' - file: "
3271                            + str(db_file)
3272                            + " and "
3273                            + str(db_hdr_file)
3274                        )
3275
3276                        # Load header as VCF object
3277                        db_hdr_vcf = Variants(input=db_hdr_file)
3278                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3279                        log.debug(
3280                            "Annotation database header: "
3281                            + str(db_hdr_vcf_header_infos)
3282                        )
3283
3284                        # For all fields in database
3285                        annotation_fields_full = False
3286                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3287                            annotation_fields = {
3288                                key: key for key in db_hdr_vcf_header_infos
3289                            }
3290                            log.debug(
3291                                "Annotation database header - All annotations added: "
3292                                + str(annotation_fields)
3293                            )
3294                            annotation_fields_full = True
3295
3296                        # # Create file for field rename
3297                        # log.debug("Create file for field rename")
3298                        # tmp_rename = NamedTemporaryFile(
3299                        #     prefix=self.get_prefix(),
3300                        #     dir=self.get_tmp_dir(),
3301                        #     suffix=".rename",
3302                        #     delete=False,
3303                        # )
3304                        # tmp_rename_name = tmp_rename.name
3305                        # tmp_files.append(tmp_rename_name)
3306
3307                        # Number of fields
3308                        nb_annotation_field = 0
3309                        annotation_list = []
3310                        annotation_infos_rename_list = []
3311
3312                        for annotation_field in annotation_fields:
3313
3314                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3315                            annotation_fields_new_name = annotation_fields.get(
3316                                annotation_field, annotation_field
3317                            )
3318                            if not annotation_fields_new_name:
3319                                annotation_fields_new_name = annotation_field
3320
3321                            # Check if field is in DB and if field is not elready in input data
3322                            if (
3323                                annotation_field in db_hdr_vcf.get_header().infos
3324                                and annotation_fields_new_name
3325                                not in self.get_header().infos
3326                            ):
3327
3328                                log.info(
3329                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3330                                )
3331
3332                                # BCFTools annotate param to rename fields
3333                                if annotation_field != annotation_fields_new_name:
3334                                    annotation_infos_rename_list.append(
3335                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3336                                    )
3337
3338                                # Add INFO field to header
3339                                db_hdr_vcf_header_infos_number = (
3340                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3341                                )
3342                                db_hdr_vcf_header_infos_type = (
3343                                    db_hdr_vcf_header_infos[annotation_field].type
3344                                    or "String"
3345                                )
3346                                db_hdr_vcf_header_infos_description = (
3347                                    db_hdr_vcf_header_infos[annotation_field].desc
3348                                    or f"{annotation_field} description"
3349                                )
3350                                db_hdr_vcf_header_infos_source = (
3351                                    db_hdr_vcf_header_infos[annotation_field].source
3352                                    or "unknown"
3353                                )
3354                                db_hdr_vcf_header_infos_version = (
3355                                    db_hdr_vcf_header_infos[annotation_field].version
3356                                    or "unknown"
3357                                )
3358
3359                                vcf_reader.infos[annotation_fields_new_name] = (
3360                                    vcf.parser._Info(
3361                                        annotation_fields_new_name,
3362                                        db_hdr_vcf_header_infos_number,
3363                                        db_hdr_vcf_header_infos_type,
3364                                        db_hdr_vcf_header_infos_description,
3365                                        db_hdr_vcf_header_infos_source,
3366                                        db_hdr_vcf_header_infos_version,
3367                                        self.code_type_map[
3368                                            db_hdr_vcf_header_infos_type
3369                                        ],
3370                                    )
3371                                )
3372
3373                                annotation_list.append(annotation_field)
3374
3375                                nb_annotation_field += 1
3376
3377                            else:
3378
3379                                if (
3380                                    annotation_field
3381                                    not in db_hdr_vcf.get_header().infos
3382                                ):
3383                                    log.warning(
3384                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3385                                    )
3386                                if (
3387                                    annotation_fields_new_name
3388                                    in self.get_header().infos
3389                                ):
3390                                    log.warning(
3391                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3392                                    )
3393
3394                        log.info(
3395                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3396                        )
3397
3398                        annotation_infos = ",".join(annotation_list)
3399
3400                        if annotation_infos != "":
3401
3402                            # Annotated VCF (and error file)
3403                            tmp_annotation_vcf_name = os.path.join(
3404                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3405                            )
3406                            tmp_annotation_vcf_name_err = (
3407                                tmp_annotation_vcf_name + ".err"
3408                            )
3409
3410                            # Add fields to annotate
3411                            if not annotation_fields_full:
3412                                annotation_infos_option = f"-info {annotation_infos}"
3413                            else:
3414                                annotation_infos_option = ""
3415
3416                            # Info fields rename
3417                            if annotation_infos_rename_list:
3418                                annotation_infos_rename = " -c " + ",".join(
3419                                    annotation_infos_rename_list
3420                                )
3421                            else:
3422                                annotation_infos_rename = ""
3423
3424                            # Annotate command
3425                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3426
3427                            # Add command
3428                            commands[command_annotate] = tmp_annotation_vcf_name
3429
3430                if commands:
3431
3432                    # Export VCF file
3433                    self.export_variant_vcf(
3434                        vcf_file=tmp_vcf_name,
3435                        remove_info=True,
3436                        add_samples=False,
3437                        index=True,
3438                    )
3439                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3440
3441                    # Num command
3442                    nb_command = 0
3443
3444                    # Annotate
3445                    for command_annotate in commands:
3446                        nb_command += 1
3447                        log.info(
3448                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3449                        )
3450                        log.debug(f"command_annotate={command_annotate}")
3451                        run_parallel_commands([command_annotate], threads)
3452
3453                        # Debug
3454                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3455
3456                        # Update variants
3457                        log.info(
3458                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3459                        )
3460                        self.update_from_vcf(commands[command_annotate])

This function annotates variants using bcftools

Parameters
  • threads: Number of threads to use
Returns

None.

def annotation_bcftools(self, threads: int = None) -> None:
3462    def annotation_bcftools(self, threads: int = None) -> None:
3463        """
3464        This function annotate with bcftools
3465
3466        :param threads: Number of threads to use
3467        :return: the value of the variable "return_value".
3468        """
3469
3470        # DEBUG
3471        log.debug("Start annotation with bcftools databases")
3472
3473        # Threads
3474        if not threads:
3475            threads = self.get_threads()
3476        log.debug("Threads: " + str(threads))
3477
3478        # Config
3479        config = self.get_config()
3480        log.debug("Config: " + str(config))
3481
3482        # DEBUG
3483        delete_tmp = True
3484        if self.get_config().get("verbosity", "warning") in ["debug"]:
3485            delete_tmp = False
3486            log.debug("Delete tmp files/folders: " + str(delete_tmp))
3487
3488        # Config - BCFTools bin command
3489        bcftools_bin_command = get_bin_command(
3490            bin="bcftools",
3491            tool="bcftools",
3492            bin_type="bin",
3493            config=config,
3494            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3495        )
3496        if not bcftools_bin_command:
3497            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3498            log.error(msg_err)
3499            raise ValueError(msg_err)
3500
3501        # Config - BCFTools databases folders
3502        databases_folders = set(
3503            self.get_config()
3504            .get("folders", {})
3505            .get("databases", {})
3506            .get("annotations", ["."])
3507            + self.get_config()
3508            .get("folders", {})
3509            .get("databases", {})
3510            .get("bcftools", ["."])
3511        )
3512        log.debug("Databases annotations: " + str(databases_folders))
3513
3514        # Param
3515        annotations = (
3516            self.get_param()
3517            .get("annotation", {})
3518            .get("bcftools", {})
3519            .get("annotations", None)
3520        )
3521        log.debug("Annotations: " + str(annotations))
3522
3523        # Assembly
3524        assembly = self.get_param().get(
3525            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3526        )
3527
3528        # Data
3529        table_variants = self.get_table_variants()
3530
3531        # Check if not empty
3532        log.debug("Check if not empty")
3533        sql_query_chromosomes = (
3534            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3535        )
3536        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3537        if not sql_query_chromosomes_df["count"][0]:
3538            log.info(f"VCF empty")
3539            return
3540
3541        # Export in VCF
3542        log.debug("Create initial file to annotate")
3543        tmp_vcf = NamedTemporaryFile(
3544            prefix=self.get_prefix(),
3545            dir=self.get_tmp_dir(),
3546            suffix=".vcf.gz",
3547            delete=False,
3548        )
3549        tmp_vcf_name = tmp_vcf.name
3550
3551        # VCF header
3552        vcf_reader = self.get_header()
3553        log.debug("Initial header: " + str(vcf_reader.infos))
3554
3555        # Existing annotations
3556        for vcf_annotation in self.get_header().infos:
3557
3558            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3559            log.debug(
3560                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3561            )
3562
3563        if annotations:
3564
3565            tmp_ann_vcf_list = []
3566            commands = []
3567            tmp_files = []
3568            err_files = []
3569
3570            for annotation in annotations:
3571                annotation_fields = annotations[annotation]
3572
3573                # Annotation Name
3574                annotation_name = os.path.basename(annotation)
3575
3576                if not annotation_fields:
3577                    annotation_fields = {"INFO": None}
3578
3579                log.debug(f"Annotation '{annotation_name}'")
3580                log.debug(
3581                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3582                )
3583
3584                # Create Database
3585                database = Database(
3586                    database=annotation,
3587                    databases_folders=databases_folders,
3588                    assembly=assembly,
3589                )
3590
3591                # Find files
3592                db_file = database.get_database()
3593                db_file = full_path(db_file)
3594                db_hdr_file = database.get_header_file()
3595                db_hdr_file = full_path(db_hdr_file)
3596                db_file_type = database.get_format()
3597                db_tbi_file = f"{db_file}.tbi"
3598                db_file_compressed = database.is_compressed()
3599
3600                # Check if compressed
3601                if not db_file_compressed:
3602                    log.error(
3603                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3604                    )
3605                    raise ValueError(
3606                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
3607                    )
3608
3609                # Check if indexed
3610                if not os.path.exists(db_tbi_file):
3611                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
3612                    raise ValueError(
3613                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
3614                    )
3615
3616                # Check index - try to create if not exists
3617                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3618                    log.error("Annotation failed: database not valid")
3619                    log.error(f"Annotation annotation file: {db_file}")
3620                    log.error(f"Annotation annotation header: {db_hdr_file}")
3621                    log.error(f"Annotation annotation index: {db_tbi_file}")
3622                    raise ValueError(
3623                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3624                    )
3625                else:
3626
3627                    log.debug(
3628                        f"Annotation '{annotation}' - file: "
3629                        + str(db_file)
3630                        + " and "
3631                        + str(db_hdr_file)
3632                    )
3633
3634                    # Load header as VCF object
3635                    db_hdr_vcf = Variants(input=db_hdr_file)
3636                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3637                    log.debug(
3638                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
3639                    )
3640
3641                    # For all fields in database
3642                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
3643                        annotation_fields = {
3644                            key: key for key in db_hdr_vcf_header_infos
3645                        }
3646                        log.debug(
3647                            "Annotation database header - All annotations added: "
3648                            + str(annotation_fields)
3649                        )
3650
3651                    # Number of fields
3652                    nb_annotation_field = 0
3653                    annotation_list = []
3654
3655                    for annotation_field in annotation_fields:
3656
3657                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3658                        annotation_fields_new_name = annotation_fields.get(
3659                            annotation_field, annotation_field
3660                        )
3661                        if not annotation_fields_new_name:
3662                            annotation_fields_new_name = annotation_field
3663
3664                        # Check if field is in DB and if field is not elready in input data
3665                        if (
3666                            annotation_field in db_hdr_vcf.get_header().infos
3667                            and annotation_fields_new_name
3668                            not in self.get_header().infos
3669                        ):
3670
3671                            log.info(
3672                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3673                            )
3674
3675                            # Add INFO field to header
3676                            db_hdr_vcf_header_infos_number = (
3677                                db_hdr_vcf_header_infos[annotation_field].num or "."
3678                            )
3679                            db_hdr_vcf_header_infos_type = (
3680                                db_hdr_vcf_header_infos[annotation_field].type
3681                                or "String"
3682                            )
3683                            db_hdr_vcf_header_infos_description = (
3684                                db_hdr_vcf_header_infos[annotation_field].desc
3685                                or f"{annotation_field} description"
3686                            )
3687                            db_hdr_vcf_header_infos_source = (
3688                                db_hdr_vcf_header_infos[annotation_field].source
3689                                or "unknown"
3690                            )
3691                            db_hdr_vcf_header_infos_version = (
3692                                db_hdr_vcf_header_infos[annotation_field].version
3693                                or "unknown"
3694                            )
3695
3696                            vcf_reader.infos[annotation_fields_new_name] = (
3697                                vcf.parser._Info(
3698                                    annotation_fields_new_name,
3699                                    db_hdr_vcf_header_infos_number,
3700                                    db_hdr_vcf_header_infos_type,
3701                                    db_hdr_vcf_header_infos_description,
3702                                    db_hdr_vcf_header_infos_source,
3703                                    db_hdr_vcf_header_infos_version,
3704                                    self.code_type_map[db_hdr_vcf_header_infos_type],
3705                                )
3706                            )
3707
3708                            # annotation_list.append(annotation_field)
3709                            if annotation_field != annotation_fields_new_name:
3710                                annotation_list.append(
3711                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3712                                )
3713                            else:
3714                                annotation_list.append(annotation_field)
3715
3716                            nb_annotation_field += 1
3717
3718                        else:
3719
3720                            if annotation_field not in db_hdr_vcf.get_header().infos:
3721                                log.warning(
3722                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
3723                                )
3724                            if annotation_fields_new_name in self.get_header().infos:
3725                                log.warning(
3726                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
3727                                )
3728
3729                    log.info(
3730                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3731                    )
3732
3733                    annotation_infos = ",".join(annotation_list)
3734
3735                    if annotation_infos != "":
3736
3737                        # Protect header for bcftools (remove "#CHROM" and variants line)
3738                        log.debug("Protect Header file - remove #CHROM line if exists")
3739                        tmp_header_vcf = NamedTemporaryFile(
3740                            prefix=self.get_prefix(),
3741                            dir=self.get_tmp_dir(),
3742                            suffix=".hdr",
3743                            delete=False,
3744                        )
3745                        tmp_header_vcf_name = tmp_header_vcf.name
3746                        tmp_files.append(tmp_header_vcf_name)
3747                        # Command
3748                        if db_hdr_file.endswith(".gz"):
3749                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3750                        else:
3751                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
3752                        # Run
3753                        run_parallel_commands([command_extract_header], 1)
3754
3755                        # Find chomosomes
3756                        log.debug("Find chromosomes ")
3757                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
3758                        sql_query_chromosomes_df = self.get_query_to_df(
3759                            sql_query_chromosomes
3760                        )
3761                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
3762
3763                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
3764
3765                        # BED columns in the annotation file
3766                        if db_file_type in ["bed"]:
3767                            annotation_infos = "CHROM,POS,POS," + annotation_infos
3768
3769                        for chrom in chomosomes_list:
3770
3771                            # Create BED on initial VCF
3772                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
3773                            tmp_bed = NamedTemporaryFile(
3774                                prefix=self.get_prefix(),
3775                                dir=self.get_tmp_dir(),
3776                                suffix=".bed",
3777                                delete=False,
3778                            )
3779                            tmp_bed_name = tmp_bed.name
3780                            tmp_files.append(tmp_bed_name)
3781
3782                            # Detecte regions
3783                            log.debug(
3784                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
3785                            )
3786                            window = 1000000
3787                            sql_query_intervals_for_bed = f"""
3788                                SELECT  \"#CHROM\",
3789                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
3790                                        \"POS\"+{window}
3791                                FROM {table_variants} as table_variants
3792                                WHERE table_variants.\"#CHROM\" = '{chrom}'
3793                            """
3794                            regions = self.conn.execute(
3795                                sql_query_intervals_for_bed
3796                            ).fetchall()
3797                            merged_regions = merge_regions(regions)
3798                            log.debug(
3799                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
3800                            )
3801
3802                            header = ["#CHROM", "START", "END"]
3803                            with open(tmp_bed_name, "w") as f:
3804                                # Write the header with tab delimiter
3805                                f.write("\t".join(header) + "\n")
3806                                for d in merged_regions:
3807                                    # Write each data row with tab delimiter
3808                                    f.write("\t".join(map(str, d)) + "\n")
3809
3810                            # Tmp files
3811                            tmp_annotation_vcf = NamedTemporaryFile(
3812                                prefix=self.get_prefix(),
3813                                dir=self.get_tmp_dir(),
3814                                suffix=".vcf.gz",
3815                                delete=False,
3816                            )
3817                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
3818                            tmp_files.append(tmp_annotation_vcf_name)
3819                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
3820                            tmp_annotation_vcf_name_err = (
3821                                tmp_annotation_vcf_name + ".err"
3822                            )
3823                            err_files.append(tmp_annotation_vcf_name_err)
3824
3825                            # Annotate Command
3826                            log.debug(
3827                                f"Annotation '{annotation}' - add bcftools command"
3828                            )
3829
3830                            # Command
3831                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3832
3833                            # Add command
3834                            commands.append(command_annotate)
3835
3836            # if some commands
3837            if commands:
3838
3839                # Export VCF file
3840                self.export_variant_vcf(
3841                    vcf_file=tmp_vcf_name,
3842                    remove_info=True,
3843                    add_samples=False,
3844                    index=True,
3845                )
3846
3847                # Threads
3848                # calculate threads for annotated commands
3849                if commands:
3850                    threads_bcftools_annotate = round(threads / len(commands))
3851                else:
3852                    threads_bcftools_annotate = 1
3853
3854                if not threads_bcftools_annotate:
3855                    threads_bcftools_annotate = 1
3856
3857                # Add threads option to bcftools commands
3858                if threads_bcftools_annotate > 1:
3859                    commands_threaded = []
3860                    for command in commands:
3861                        commands_threaded.append(
3862                            command.replace(
3863                                f"{bcftools_bin_command} annotate ",
3864                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
3865                            )
3866                        )
3867                    commands = commands_threaded
3868
3869                # Command annotation multithreading
3870                log.debug(f"Annotation - Annotation commands: " + str(commands))
3871                log.info(
3872                    f"Annotation - Annotation multithreaded in "
3873                    + str(len(commands))
3874                    + " commands"
3875                )
3876
3877                run_parallel_commands(commands, threads)
3878
3879                # Merge
3880                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
3881
3882                if tmp_ann_vcf_list_cmd:
3883
3884                    # Tmp file
3885                    tmp_annotate_vcf = NamedTemporaryFile(
3886                        prefix=self.get_prefix(),
3887                        dir=self.get_tmp_dir(),
3888                        suffix=".vcf.gz",
3889                        delete=True,
3890                    )
3891                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
3892                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
3893                    err_files.append(tmp_annotate_vcf_name_err)
3894
3895                    # Tmp file remove command
3896                    tmp_files_remove_command = ""
3897                    if tmp_files:
3898                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
3899
3900                    # Command merge
3901                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
3902                    log.info(
3903                        f"Annotation - Annotation merging "
3904                        + str(len(commands))
3905                        + " annotated files"
3906                    )
3907                    log.debug(f"Annotation - merge command: {merge_command}")
3908                    run_parallel_commands([merge_command], 1)
3909
3910                    # Error messages
3911                    log.info(f"Error/Warning messages:")
3912                    error_message_command_all = []
3913                    error_message_command_warning = []
3914                    error_message_command_err = []
3915                    for err_file in err_files:
3916                        with open(err_file, "r") as f:
3917                            for line in f:
3918                                message = line.strip()
3919                                error_message_command_all.append(message)
3920                                if line.startswith("[W::"):
3921                                    error_message_command_warning.append(message)
3922                                if line.startswith("[E::"):
3923                                    error_message_command_err.append(
3924                                        f"{err_file}: " + message
3925                                    )
3926                    # log info
3927                    for message in list(
3928                        set(error_message_command_err + error_message_command_warning)
3929                    ):
3930                        log.info(f"   {message}")
3931                    # debug info
3932                    for message in list(set(error_message_command_all)):
3933                        log.debug(f"   {message}")
3934                    # failed
3935                    if len(error_message_command_err):
3936                        log.error("Annotation failed: Error in commands")
3937                        raise ValueError("Annotation failed: Error in commands")
3938
3939                    # Update variants
3940                    log.info(f"Annotation - Updating...")
3941                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with bcftools.

Parameters
  • threads: Number of threads to use

Returns
  The value of the variable "return_value".

def annotation_exomiser(self, threads: int = None) -> None:
3943    def annotation_exomiser(self, threads: int = None) -> None:
3944        """
3945        This function annotate with Exomiser
3946
3947        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
3948        - "analysis" (dict/file):
3949            Full analysis dictionnary parameters (see Exomiser docs).
3950            Either a dict, or a file in JSON or YAML format.
3951            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
3952            Default : None
3953        - "preset" (string):
3954            Analysis preset (available in config folder).
3955            Used if no full "analysis" is provided.
3956            Default: "exome"
3957        - "phenopacket" (dict/file):
3958            Samples and phenotipic features parameters (see Exomiser docs).
3959            Either a dict, or a file in JSON or YAML format.
3960            Default: None
3961        - "subject" (dict):
3962            Sample parameters (see Exomiser docs).
3963            Example:
3964                "subject":
3965                    {
3966                        "id": "ISDBM322017",
3967                        "sex": "FEMALE"
3968                    }
3969            Default: None
3970        - "sample" (string):
3971            Sample name to construct "subject" section:
3972                "subject":
3973                    {
3974                        "id": "<sample>",
3975                        "sex": "UNKNOWN_SEX"
3976                    }
3977            Default: None
3978        - "phenotypicFeatures" (dict)
3979            Phenotypic features to construct "subject" section.
3980            Example:
3981                "phenotypicFeatures":
3982                    [
3983                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
3984                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
3985                    ]
3986        - "hpo" (list)
3987            List of HPO ids as phenotypic features.
3988            Example:
3989                "hpo": ['0001156', '0001363', '0011304', '0010055']
3990            Default: []
3991        - "outputOptions" (dict):
3992            Output options (see Exomiser docs).
3993            Default:
3994                "output_options" =
3995                    {
3996                        "outputContributingVariantsOnly": False,
3997                        "numGenes": 0,
3998                        "outputFormats": ["TSV_VARIANT", "VCF"]
3999                    }
4000        - "transcript_source" (string):
4001            Transcript source (either "refseq", "ucsc", "ensembl")
4002            Default: "refseq"
4003        - "exomiser_to_info" (boolean):
4004            Add exomiser TSV file columns as INFO fields in VCF.
4005            Default: False
4006        - "release" (string):
4007            Exomise database release.
4008            If not exists, database release will be downloaded (take a while).
4009            Default: None (provided by application.properties configuration file)
4010        - "exomiser_application_properties" (file):
4011            Exomiser configuration file (see Exomiser docs).
4012            Useful to automatically download databases (especially for specific genome databases).
4013
4014        Notes:
4015        - If no sample in parameters, first sample in VCF will be chosen
4016        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4017
4018        :param threads: The number of threads to use
4019        :return: None.
4020        """
4021
4022        # DEBUG
4023        log.debug("Start annotation with Exomiser databases")
4024
4025        # Threads
4026        if not threads:
4027            threads = self.get_threads()
4028        log.debug("Threads: " + str(threads))
4029
4030        # Config
4031        config = self.get_config()
4032        log.debug("Config: " + str(config))
4033
4034        # Config - Folders - Databases
4035        databases_folders = (
4036            config.get("folders", {})
4037            .get("databases", {})
4038            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4039        )
4040        databases_folders = full_path(databases_folders)
4041        if not os.path.exists(databases_folders):
4042            log.error(f"Databases annotations: {databases_folders} NOT found")
4043        log.debug("Databases annotations: " + str(databases_folders))
4044
4045        # Config - Exomiser
4046        exomiser_bin_command = get_bin_command(
4047            bin="exomiser-cli*.jar",
4048            tool="exomiser",
4049            bin_type="jar",
4050            config=config,
4051            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4052        )
4053        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4054        if not exomiser_bin_command:
4055            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4056            log.error(msg_err)
4057            raise ValueError(msg_err)
4058
4059        # Param
4060        param = self.get_param()
4061        log.debug("Param: " + str(param))
4062
4063        # Param - Exomiser
4064        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4065        log.debug(f"Param Exomiser: {param_exomiser}")
4066
4067        # Param - Assembly
4068        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4069        log.debug("Assembly: " + str(assembly))
4070
4071        # Data
4072        table_variants = self.get_table_variants()
4073
4074        # Check if not empty
4075        log.debug("Check if not empty")
4076        sql_query_chromosomes = (
4077            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4078        )
4079        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4080            log.info(f"VCF empty")
4081            return False
4082
4083        # VCF header
4084        vcf_reader = self.get_header()
4085        log.debug("Initial header: " + str(vcf_reader.infos))
4086
4087        # Samples
4088        samples = self.get_header_sample_list()
4089        if not samples:
4090            log.error("No Samples in VCF")
4091            return False
4092        log.debug(f"Samples: {samples}")
4093
4094        # Memory limit
4095        memory_limit = self.get_memory("8G")
4096        log.debug(f"memory_limit: {memory_limit}")
4097
4098        # Exomiser java options
4099        exomiser_java_options = (
4100            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4101        )
4102        log.debug(f"Exomiser java options: {exomiser_java_options}")
4103
4104        # Download Exomiser (if not exists)
4105        exomiser_release = param_exomiser.get("release", None)
4106        exomiser_application_properties = param_exomiser.get(
4107            "exomiser_application_properties", None
4108        )
4109        databases_download_exomiser(
4110            assemblies=[assembly],
4111            exomiser_folder=databases_folders,
4112            exomiser_release=exomiser_release,
4113            exomiser_phenotype_release=exomiser_release,
4114            exomiser_application_properties=exomiser_application_properties,
4115        )
4116
4117        # Force annotation
4118        force_update_annotation = True
4119
4120        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4121            log.debug("Start annotation Exomiser")
4122
4123            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4124
4125                # tmp_dir = "/tmp/exomiser"
4126
4127                ### ANALYSIS ###
4128                ################
4129
4130                # Create analysis.json through analysis dict
4131                # either analysis in param or by default
4132                # depending on preset exome/genome)
4133
4134                # Init analysis dict
4135                param_exomiser_analysis_dict = {}
4136
4137                # analysis from param
4138                param_exomiser_analysis = param_exomiser.get("analysis", {})
4139                param_exomiser_analysis = full_path(param_exomiser_analysis)
4140
4141                # If analysis in param -> load anlaysis json
4142                if param_exomiser_analysis:
4143
4144                    # If param analysis is a file and exists
4145                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4146                        param_exomiser_analysis
4147                    ):
4148                        # Load analysis file into analysis dict (either yaml or json)
4149                        with open(param_exomiser_analysis) as json_file:
4150                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4151
4152                    # If param analysis is a dict
4153                    elif isinstance(param_exomiser_analysis, dict):
4154                        # Load analysis dict into analysis dict (either yaml or json)
4155                        param_exomiser_analysis_dict = param_exomiser_analysis
4156
4157                    # Error analysis type
4158                    else:
4159                        log.error(f"Analysis type unknown. Check param file.")
4160                        raise ValueError(f"Analysis type unknown. Check param file.")
4161
4162                # Case no input analysis config file/dict
4163                # Use preset (exome/genome) to open default config file
4164                if not param_exomiser_analysis_dict:
4165
4166                    # default preset
4167                    default_preset = "exome"
4168
4169                    # Get param preset or default preset
4170                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4171
4172                    # Try to find if preset is a file
4173                    if os.path.exists(param_exomiser_preset):
4174                        # Preset file is provided in full path
4175                        param_exomiser_analysis_default_config_file = (
4176                            param_exomiser_preset
4177                        )
4178                    # elif os.path.exists(full_path(param_exomiser_preset)):
4179                    #     # Preset file is provided in full path
4180                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4181                    elif os.path.exists(
4182                        os.path.join(folder_config, param_exomiser_preset)
4183                    ):
4184                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4185                        param_exomiser_analysis_default_config_file = os.path.join(
4186                            folder_config, param_exomiser_preset
4187                        )
4188                    else:
4189                        # Construct preset file
4190                        param_exomiser_analysis_default_config_file = os.path.join(
4191                            folder_config,
4192                            f"preset-{param_exomiser_preset}-analysis.json",
4193                        )
4194
4195                    # If preset file exists
4196                    param_exomiser_analysis_default_config_file = full_path(
4197                        param_exomiser_analysis_default_config_file
4198                    )
4199                    if os.path.exists(param_exomiser_analysis_default_config_file):
4200                        # Load prest file into analysis dict (either yaml or json)
4201                        with open(
4202                            param_exomiser_analysis_default_config_file
4203                        ) as json_file:
4204                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4205                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4206                                json_file
4207                            )
4208
4209                    # Error preset file
4210                    else:
4211                        log.error(
4212                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4213                        )
4214                        raise ValueError(
4215                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4216                        )
4217
4218                # If no analysis dict created
4219                if not param_exomiser_analysis_dict:
4220                    log.error(f"No analysis config")
4221                    raise ValueError(f"No analysis config")
4222
4223                # Log
4224                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4225
4226                ### PHENOPACKET ###
4227                ###################
4228
4229                # If no PhenoPacket in analysis dict -> check in param
4230                if "phenopacket" not in param_exomiser_analysis_dict:
4231
4232                    # If PhenoPacket in param -> load anlaysis json
4233                    if param_exomiser.get("phenopacket", None):
4234
4235                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4236                        param_exomiser_phenopacket = full_path(
4237                            param_exomiser_phenopacket
4238                        )
4239
4240                        # If param phenopacket is a file and exists
4241                        if isinstance(
4242                            param_exomiser_phenopacket, str
4243                        ) and os.path.exists(param_exomiser_phenopacket):
4244                            # Load phenopacket file into analysis dict (either yaml or json)
4245                            with open(param_exomiser_phenopacket) as json_file:
4246                                param_exomiser_analysis_dict["phenopacket"] = (
4247                                    yaml.safe_load(json_file)
4248                                )
4249
4250                        # If param phenopacket is a dict
4251                        elif isinstance(param_exomiser_phenopacket, dict):
4252                            # Load phenopacket dict into analysis dict (either yaml or json)
4253                            param_exomiser_analysis_dict["phenopacket"] = (
4254                                param_exomiser_phenopacket
4255                            )
4256
4257                        # Error phenopacket type
4258                        else:
4259                            log.error(f"Phenopacket type unknown. Check param file.")
4260                            raise ValueError(
4261                                f"Phenopacket type unknown. Check param file."
4262                            )
4263
4264                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4265                if "phenopacket" not in param_exomiser_analysis_dict:
4266
4267                    # Init PhenoPacket
4268                    param_exomiser_analysis_dict["phenopacket"] = {
4269                        "id": "analysis",
4270                        "proband": {},
4271                    }
4272
4273                    ### Add subject ###
4274
4275                    # If subject exists
4276                    param_exomiser_subject = param_exomiser.get("subject", {})
4277
4278                    # If subject not exists -> found sample ID
4279                    if not param_exomiser_subject:
4280
4281                        # Found sample ID in param
4282                        sample = param_exomiser.get("sample", None)
4283
4284                        # Find sample ID (first sample)
4285                        if not sample:
4286                            sample_list = self.get_header_sample_list()
4287                            if len(sample_list) > 0:
4288                                sample = sample_list[0]
4289                            else:
4290                                log.error(f"No sample found")
4291                                raise ValueError(f"No sample found")
4292
4293                        # Create subject
4294                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4295
4296                    # Add to dict
4297                    param_exomiser_analysis_dict["phenopacket"][
4298                        "subject"
4299                    ] = param_exomiser_subject
4300
4301                    ### Add "phenotypicFeatures" ###
4302
4303                    # If phenotypicFeatures exists
4304                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4305                        "phenotypicFeatures", []
4306                    )
4307
4308                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4309                    if not param_exomiser_phenotypicfeatures:
4310
4311                        # Found HPO in param
4312                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4313
4314                        # Split HPO if list in string format separated by comma
4315                        if isinstance(param_exomiser_hpo, str):
4316                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4317
4318                        # Create HPO list
4319                        for hpo in param_exomiser_hpo:
4320                            hpo_clean = re.sub("[^0-9]", "", hpo)
4321                            param_exomiser_phenotypicfeatures.append(
4322                                {
4323                                    "type": {
4324                                        "id": f"HP:{hpo_clean}",
4325                                        "label": f"HP:{hpo_clean}",
4326                                    }
4327                                }
4328                            )
4329
4330                    # Add to dict
4331                    param_exomiser_analysis_dict["phenopacket"][
4332                        "phenotypicFeatures"
4333                    ] = param_exomiser_phenotypicfeatures
4334
4335                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4336                    if not param_exomiser_phenotypicfeatures:
4337                        for step in param_exomiser_analysis_dict.get(
4338                            "analysis", {}
4339                        ).get("steps", []):
4340                            if "hiPhivePrioritiser" in step:
4341                                param_exomiser_analysis_dict.get("analysis", {}).get(
4342                                    "steps", []
4343                                ).remove(step)
4344
4345                ### Add Input File ###
4346
4347                # Initial file name and htsFiles
4348                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4349                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4350                    {
4351                        "uri": tmp_vcf_name,
4352                        "htsFormat": "VCF",
4353                        "genomeAssembly": assembly,
4354                    }
4355                ]
4356
4357                ### Add metaData ###
4358
4359                # If metaData not in analysis dict
4360                if "metaData" not in param_exomiser_analysis_dict:
4361                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4362                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4363                        "createdBy": "howard",
4364                        "phenopacketSchemaVersion": 1,
4365                    }
4366
4367                ### OutputOptions ###
4368
4369                # Init output result folder
4370                output_results = os.path.join(tmp_dir, "results")
4371
4372                # If no outputOptions in analysis dict
4373                if "outputOptions" not in param_exomiser_analysis_dict:
4374
4375                    # default output formats
4376                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4377
4378                    # Get outputOptions in param
4379                    output_options = param_exomiser.get("outputOptions", None)
4380
4381                    # If no output_options in param -> check
4382                    if not output_options:
4383                        output_options = {
4384                            "outputContributingVariantsOnly": False,
4385                            "numGenes": 0,
4386                            "outputFormats": defaut_output_formats,
4387                        }
4388
4389                    # Replace outputDirectory in output options
4390                    output_options["outputDirectory"] = output_results
4391                    output_options["outputFileName"] = "howard"
4392
4393                    # Add outputOptions in analysis dict
4394                    param_exomiser_analysis_dict["outputOptions"] = output_options
4395
4396                else:
4397
4398                    # Replace output_results and output format (if exists in param)
4399                    param_exomiser_analysis_dict["outputOptions"][
4400                        "outputDirectory"
4401                    ] = output_results
4402                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4403                        list(
4404                            set(
4405                                param_exomiser_analysis_dict.get(
4406                                    "outputOptions", {}
4407                                ).get("outputFormats", [])
4408                                + ["TSV_VARIANT", "VCF"]
4409                            )
4410                        )
4411                    )
4412
4413                # log
4414                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4415
4416                ### ANALYSIS FILE ###
4417                #####################
4418
4419                ### Full JSON analysis config file ###
4420
4421                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4422                with open(exomiser_analysis, "w") as fp:
4423                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4424
4425                ### SPLIT analysis and sample config files
4426
4427                # Splitted analysis dict
4428                param_exomiser_analysis_dict_for_split = (
4429                    param_exomiser_analysis_dict.copy()
4430                )
4431
4432                # Phenopacket JSON file
4433                exomiser_analysis_phenopacket = os.path.join(
4434                    tmp_dir, "analysis_phenopacket.json"
4435                )
4436                with open(exomiser_analysis_phenopacket, "w") as fp:
4437                    json.dump(
4438                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4439                        fp,
4440                        indent=4,
4441                    )
4442
4443                # Analysis JSON file without Phenopacket parameters
4444                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4445                exomiser_analysis_analysis = os.path.join(
4446                    tmp_dir, "analysis_analysis.json"
4447                )
4448                with open(exomiser_analysis_analysis, "w") as fp:
4449                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4450
4451                ### INITAL VCF file ###
4452                #######################
4453
4454                ### Create list of samples to use and include inti initial VCF file ####
4455
4456                # Subject (main sample)
4457                # Get sample ID in analysis dict
4458                sample_subject = (
4459                    param_exomiser_analysis_dict.get("phenopacket", {})
4460                    .get("subject", {})
4461                    .get("id", None)
4462                )
4463                sample_proband = (
4464                    param_exomiser_analysis_dict.get("phenopacket", {})
4465                    .get("proband", {})
4466                    .get("subject", {})
4467                    .get("id", None)
4468                )
4469                sample = []
4470                if sample_subject:
4471                    sample.append(sample_subject)
4472                if sample_proband:
4473                    sample.append(sample_proband)
4474
4475                # Get sample ID within Pedigree
4476                pedigree_persons_list = (
4477                    param_exomiser_analysis_dict.get("phenopacket", {})
4478                    .get("pedigree", {})
4479                    .get("persons", {})
4480                )
4481
4482                # Create list with all sample ID in pedigree (if exists)
4483                pedigree_persons = []
4484                for person in pedigree_persons_list:
4485                    pedigree_persons.append(person.get("individualId"))
4486
4487                # Concat subject sample ID and samples ID in pedigreesamples
4488                samples = list(set(sample + pedigree_persons))
4489
4490                # Check if sample list is not empty
4491                if not samples:
4492                    log.error(f"No samples found")
4493                    raise ValueError(f"No samples found")
4494
4495                # Create VCF with sample (either sample in param or first one by default)
4496                # Export VCF file
4497                self.export_variant_vcf(
4498                    vcf_file=tmp_vcf_name,
4499                    remove_info=True,
4500                    add_samples=True,
4501                    list_samples=samples,
4502                    index=False,
4503                )
4504
4505                ### Execute Exomiser ###
4506                ########################
4507
4508                # Init command
4509                exomiser_command = ""
4510
4511                # Command exomiser options
4512                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4513
4514                # Release
4515                exomiser_release = param_exomiser.get("release", None)
4516                if exomiser_release:
4517                    # phenotype data version
4518                    exomiser_options += (
4519                        f" --exomiser.phenotype.data-version={exomiser_release} "
4520                    )
4521                    # data version
4522                    exomiser_options += (
4523                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4524                    )
4525                    # variant white list
4526                    variant_white_list_file = (
4527                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4528                    )
4529                    if os.path.exists(
4530                        os.path.join(
4531                            databases_folders, assembly, variant_white_list_file
4532                        )
4533                    ):
4534                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4535
4536                # transcript_source
4537                transcript_source = param_exomiser.get(
4538                    "transcript_source", None
4539                )  # ucsc, refseq, ensembl
4540                if transcript_source:
4541                    exomiser_options += (
4542                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4543                    )
4544
4545                # If analysis contain proband param
4546                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4547                    "proband", {}
4548                ):
4549                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4550
4551                # If no proband (usually uniq sample)
4552                else:
4553                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4554
4555                # Log
4556                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4557
4558                # Run command
4559                result = subprocess.call(
4560                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4561                )
4562                if result:
4563                    log.error("Exomiser command failed")
4564                    raise ValueError("Exomiser command failed")
4565
4566                ### RESULTS ###
4567                ###############
4568
4569                ### Annotate with TSV fields ###
4570
4571                # Init result tsv file
4572                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4573
4574                # Init result tsv file
4575                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4576
4577                # Parse TSV file and explode columns in INFO field
4578                if exomiser_to_info and os.path.exists(output_results_tsv):
4579
4580                    # Log
4581                    log.debug("Exomiser columns to VCF INFO field")
4582
4583                    # Retrieve columns and types
4584                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4585                    output_results_tsv_df = self.get_query_to_df(query)
4586                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4587
4588                    # Init concat fields for update
4589                    sql_query_update_concat_fields = []
4590
4591                    # Fields to avoid
4592                    fields_to_avoid = [
4593                        "CONTIG",
4594                        "START",
4595                        "END",
4596                        "REF",
4597                        "ALT",
4598                        "QUAL",
4599                        "FILTER",
4600                        "GENOTYPE",
4601                    ]
4602
4603                    # List all columns to add into header
4604                    for header_column in output_results_tsv_columns:
4605
4606                        # If header column is enable
4607                        if header_column not in fields_to_avoid:
4608
4609                            # Header info type
4610                            header_info_type = "String"
4611                            header_column_df = output_results_tsv_df[header_column]
4612                            header_column_df_dtype = header_column_df.dtype
4613                            if header_column_df_dtype == object:
4614                                if (
4615                                    pd.to_numeric(header_column_df, errors="coerce")
4616                                    .notnull()
4617                                    .all()
4618                                ):
4619                                    header_info_type = "Float"
4620                            else:
4621                                header_info_type = "Integer"
4622
4623                            # Header info
4624                            characters_to_validate = ["-"]
4625                            pattern = "[" + "".join(characters_to_validate) + "]"
4626                            header_info_name = re.sub(
4627                                pattern,
4628                                "_",
4629                                f"Exomiser_{header_column}".replace("#", ""),
4630                            )
4631                            header_info_number = "."
4632                            header_info_description = (
4633                                f"Exomiser {header_column} annotation"
4634                            )
4635                            header_info_source = "Exomiser"
4636                            header_info_version = "unknown"
4637                            header_info_code = CODE_TYPE_MAP[header_info_type]
4638                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4639                                header_info_name,
4640                                header_info_number,
4641                                header_info_type,
4642                                header_info_description,
4643                                header_info_source,
4644                                header_info_version,
4645                                header_info_code,
4646                            )
4647
4648                            # Add field to add for update to concat fields
4649                            sql_query_update_concat_fields.append(
4650                                f"""
4651                                CASE
4652                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4653                                    THEN concat(
4654                                        '{header_info_name}=',
4655                                        table_parquet."{header_column}",
4656                                        ';'
4657                                        )
4658
4659                                    ELSE ''
4660                                END
4661                            """
4662                            )
4663
4664                    # Update query
4665                    sql_query_update = f"""
4666                        UPDATE {table_variants} as table_variants
4667                            SET INFO = concat(
4668                                            CASE
4669                                                WHEN INFO NOT IN ('', '.')
4670                                                THEN INFO
4671                                                ELSE ''
4672                                            END,
4673                                            CASE
4674                                                WHEN table_variants.INFO NOT IN ('','.')
4675                                                THEN ';'
4676                                                ELSE ''
4677                                            END,
4678                                            (
4679                                            SELECT 
4680                                                concat(
4681                                                    {",".join(sql_query_update_concat_fields)}
4682                                                )
4683                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4684                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4685                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4686                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4687                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4688                                            )
4689                                        )
4690                            ;
4691                        """
4692
4693                    # Update
4694                    self.conn.execute(sql_query_update)
4695
4696                ### Annotate with VCF INFO field ###
4697
4698                # Init result VCF file
4699                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4700
4701                # If VCF exists
4702                if os.path.exists(output_results_vcf):
4703
4704                    # Log
4705                    log.debug("Exomiser result VCF update variants")
4706
4707                    # Find Exomiser INFO field annotation in header
4708                    with gzip.open(output_results_vcf, "rt") as f:
4709                        header_list = self.read_vcf_header(f)
4710                    exomiser_vcf_header = vcf.Reader(
4711                        io.StringIO("\n".join(header_list))
4712                    )
4713
4714                    # Add annotation INFO field to header
4715                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4716
4717                    # Update variants with VCF
4718                    self.update_from_vcf(output_results_vcf)
4719
4720        return True

This function annotates with Exomiser

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO). Default: None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict): Phenotypic features to construct the "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4722    def annotation_snpeff(self, threads: int = None) -> None:
4723        """
4724        This function annotate with snpEff
4725
4726        :param threads: The number of threads to use
4727        :return: the value of the variable "return_value".
4728        """
4729
4730        # DEBUG
4731        log.debug("Start annotation with snpeff databases")
4732
4733        # Threads
4734        if not threads:
4735            threads = self.get_threads()
4736        log.debug("Threads: " + str(threads))
4737
4738        # DEBUG
4739        delete_tmp = True
4740        if self.get_config().get("verbosity", "warning") in ["debug"]:
4741            delete_tmp = False
4742            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4743
4744        # Config
4745        config = self.get_config()
4746        log.debug("Config: " + str(config))
4747
4748        # Config - Folders - Databases
4749        databases_folders = (
4750            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4751        )
4752        log.debug("Databases annotations: " + str(databases_folders))
4753
4754        # # Config - Java
4755        # java_bin = get_bin(
4756        #     tool="java",
4757        #     bin="java",
4758        #     bin_type="bin",
4759        #     config=config,
4760        #     default_folder="/usr/bin",
4761        # )
4762        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4763        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4764        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4765
4766        # # Config - snpEff bin
4767        # snpeff_jar = get_bin(
4768        #     tool="snpeff",
4769        #     bin="snpEff.jar",
4770        #     bin_type="jar",
4771        #     config=config,
4772        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4773        # )
4774        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4775        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4776        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4777
4778        # Config - snpEff bin command
4779        snpeff_bin_command = get_bin_command(
4780            bin="snpEff.jar",
4781            tool="snpeff",
4782            bin_type="jar",
4783            config=config,
4784            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4785        )
4786        if not snpeff_bin_command:
4787            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4788            log.error(msg_err)
4789            raise ValueError(msg_err)
4790
4791        # Config - snpEff databases
4792        snpeff_databases = (
4793            config.get("folders", {})
4794            .get("databases", {})
4795            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4796        )
4797        snpeff_databases = full_path(snpeff_databases)
4798        if snpeff_databases is not None and snpeff_databases != "":
4799            log.debug(f"Create snpEff databases folder")
4800            if not os.path.exists(snpeff_databases):
4801                os.makedirs(snpeff_databases)
4802
4803        # Param
4804        param = self.get_param()
4805        log.debug("Param: " + str(param))
4806
4807        # Param
4808        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4809        log.debug("Options: " + str(options))
4810
4811        # Param - Assembly
4812        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4813
4814        # Param - Options
4815        snpeff_options = (
4816            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4817        )
4818        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4819        snpeff_csvstats = (
4820            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4821        )
4822        if snpeff_stats:
4823            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4824            snpeff_stats = full_path(snpeff_stats)
4825            snpeff_options += f" -stats {snpeff_stats}"
4826        if snpeff_csvstats:
4827            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4828            snpeff_csvstats = full_path(snpeff_csvstats)
4829            snpeff_options += f" -csvStats {snpeff_csvstats}"
4830
4831        # Data
4832        table_variants = self.get_table_variants()
4833
4834        # Check if not empty
4835        log.debug("Check if not empty")
4836        sql_query_chromosomes = (
4837            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4838        )
4839        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4840        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4841            log.info(f"VCF empty")
4842            return
4843
4844        # Export in VCF
4845        log.debug("Create initial file to annotate")
4846        tmp_vcf = NamedTemporaryFile(
4847            prefix=self.get_prefix(),
4848            dir=self.get_tmp_dir(),
4849            suffix=".vcf.gz",
4850            delete=True,
4851        )
4852        tmp_vcf_name = tmp_vcf.name
4853
4854        # VCF header
4855        vcf_reader = self.get_header()
4856        log.debug("Initial header: " + str(vcf_reader.infos))
4857
4858        # Existing annotations
4859        for vcf_annotation in self.get_header().infos:
4860
4861            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4862            log.debug(
4863                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4864            )
4865
4866        # Memory limit
4867        # if config.get("memory", None):
4868        #     memory_limit = config.get("memory", "8G")
4869        # else:
4870        #     memory_limit = "8G"
4871        memory_limit = self.get_memory("8G")
4872        log.debug(f"memory_limit: {memory_limit}")
4873
4874        # snpEff java options
4875        snpeff_java_options = (
4876            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4877        )
4878        log.debug(f"Exomiser java options: {snpeff_java_options}")
4879
4880        force_update_annotation = True
4881
4882        if "ANN" not in self.get_header().infos or force_update_annotation:
4883
4884            # Check snpEff database
4885            log.debug(f"Check snpEff databases {[assembly]}")
4886            databases_download_snpeff(
4887                folder=snpeff_databases, assemblies=[assembly], config=config
4888            )
4889
4890            # Export VCF file
4891            self.export_variant_vcf(
4892                vcf_file=tmp_vcf_name,
4893                remove_info=True,
4894                add_samples=False,
4895                index=True,
4896            )
4897
4898            # Tmp file
4899            err_files = []
4900            tmp_annotate_vcf = NamedTemporaryFile(
4901                prefix=self.get_prefix(),
4902                dir=self.get_tmp_dir(),
4903                suffix=".vcf",
4904                delete=False,
4905            )
4906            tmp_annotate_vcf_name = tmp_annotate_vcf.name
4907            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
4908            err_files.append(tmp_annotate_vcf_name_err)
4909
4910            # Command
4911            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
4912            log.debug(f"Annotation - snpEff command: {snpeff_command}")
4913            run_parallel_commands([snpeff_command], 1)
4914
4915            # Error messages
4916            log.info(f"Error/Warning messages:")
4917            error_message_command_all = []
4918            error_message_command_warning = []
4919            error_message_command_err = []
4920            for err_file in err_files:
4921                with open(err_file, "r") as f:
4922                    for line in f:
4923                        message = line.strip()
4924                        error_message_command_all.append(message)
4925                        if line.startswith("[W::"):
4926                            error_message_command_warning.append(message)
4927                        if line.startswith("[E::"):
4928                            error_message_command_err.append(f"{err_file}: " + message)
4929            # log info
4930            for message in list(
4931                set(error_message_command_err + error_message_command_warning)
4932            ):
4933                log.info(f"   {message}")
4934            # debug info
4935            for message in list(set(error_message_command_all)):
4936                log.debug(f"   {message}")
4937            # failed
4938            if len(error_message_command_err):
4939                log.error("Annotation failed: Error in commands")
4940                raise ValueError("Annotation failed: Error in commands")
4941
4942            # Find annotation in header
4943            with open(tmp_annotate_vcf_name, "rt") as f:
4944                header_list = self.read_vcf_header(f)
4945            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
4946
4947            for ann in annovar_vcf_header.infos:
4948                if ann not in self.get_header().infos:
4949                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
4950
4951            # Update variants
4952            log.info(f"Annotation - Updating...")
4953            self.update_from_vcf(tmp_annotate_vcf_name)
4954
4955        else:
4956            if "ANN" in self.get_header().infos:
4957                log.debug(f"Existing snpEff annotations in VCF")
4958            if force_update_annotation:
4959                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
4961    def annotation_annovar(self, threads: int = None) -> None:
4962        """
4963        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
4964        annotations
4965
4966        :param threads: number of threads to use
4967        :return: the value of the variable "return_value".
4968        """
4969
4970        # DEBUG
4971        log.debug("Start annotation with Annovar databases")
4972
4973        # Threads
4974        if not threads:
4975            threads = self.get_threads()
4976        log.debug("Threads: " + str(threads))
4977
4978        # Tmp en Err files
4979        tmp_files = []
4980        err_files = []
4981
4982        # DEBUG
4983        delete_tmp = True
4984        if self.get_config().get("verbosity", "warning") in ["debug"]:
4985            delete_tmp = False
4986            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4987
4988        # Config
4989        config = self.get_config()
4990        log.debug("Config: " + str(config))
4991
4992        # Config - Folders - Databases
4993        databases_folders = (
4994            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
4995        )
4996        log.debug("Databases annotations: " + str(databases_folders))
4997
4998        # Config - annovar bin command
4999        annovar_bin_command = get_bin_command(
5000            bin="table_annovar.pl",
5001            tool="annovar",
5002            bin_type="perl",
5003            config=config,
5004            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
5005        )
5006        if not annovar_bin_command:
5007            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
5008            log.error(msg_err)
5009            raise ValueError(msg_err)
5010
5011        # Config - BCFTools bin command
5012        bcftools_bin_command = get_bin_command(
5013            bin="bcftools",
5014            tool="bcftools",
5015            bin_type="bin",
5016            config=config,
5017            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
5018        )
5019        if not bcftools_bin_command:
5020            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
5021            log.error(msg_err)
5022            raise ValueError(msg_err)
5023
5024        # Config - annovar databases
5025        annovar_databases = (
5026            config.get("folders", {})
5027            .get("databases", {})
5028            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
5029        )
5030        annovar_databases = full_path(annovar_databases)
5031        if annovar_databases != "" and not os.path.exists(annovar_databases):
5032            os.makedirs(annovar_databases)
5033
5034        # Param
5035        param = self.get_param()
5036        log.debug("Param: " + str(param))
5037
5038        # Param - options
5039        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
5040        log.debug("Options: " + str(options))
5041
5042        # Param - annotations
5043        annotations = (
5044            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
5045        )
5046        log.debug("Annotations: " + str(annotations))
5047
5048        # Param - Assembly
5049        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
5050
5051        # Annovar database assembly
5052        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
5053        if annovar_databases_assembly != "" and not os.path.exists(
5054            annovar_databases_assembly
5055        ):
5056            os.makedirs(annovar_databases_assembly)
5057
5058        # Data
5059        table_variants = self.get_table_variants()
5060
5061        # Check if not empty
5062        log.debug("Check if not empty")
5063        sql_query_chromosomes = (
5064            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5065        )
5066        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
5067        if not sql_query_chromosomes_df["count"][0]:
5068            log.info(f"VCF empty")
5069            return
5070
5071        # VCF header
5072        vcf_reader = self.get_header()
5073        log.debug("Initial header: " + str(vcf_reader.infos))
5074
5075        # Existing annotations
5076        for vcf_annotation in self.get_header().infos:
5077
5078            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5079            log.debug(
5080                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5081            )
5082
5083        force_update_annotation = True
5084
5085        if annotations:
5086
5087            commands = []
5088            tmp_annotates_vcf_name_list = []
5089
5090            # Export in VCF
5091            log.debug("Create initial file to annotate")
5092            tmp_vcf = NamedTemporaryFile(
5093                prefix=self.get_prefix(),
5094                dir=self.get_tmp_dir(),
5095                suffix=".vcf.gz",
5096                delete=False,
5097            )
5098            tmp_vcf_name = tmp_vcf.name
5099            tmp_files.append(tmp_vcf_name)
5100            tmp_files.append(tmp_vcf_name + ".tbi")
5101
5102            # Export VCF file
5103            self.export_variant_vcf(
5104                vcf_file=tmp_vcf_name,
5105                remove_info=".",
5106                add_samples=False,
5107                index=True,
5108            )
5109
5110            # Create file for field rename
5111            log.debug("Create file for field rename")
5112            tmp_rename = NamedTemporaryFile(
5113                prefix=self.get_prefix(),
5114                dir=self.get_tmp_dir(),
5115                suffix=".rename",
5116                delete=False,
5117            )
5118            tmp_rename_name = tmp_rename.name
5119            tmp_files.append(tmp_rename_name)
5120
5121            # Check Annovar database
5122            log.debug(
5123                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
5124            )
5125            databases_download_annovar(
5126                folder=annovar_databases,
5127                files=list(annotations.keys()),
5128                assemblies=[assembly],
5129            )
5130
5131            for annotation in annotations:
5132                annotation_fields = annotations[annotation]
5133
5134                if not annotation_fields:
5135                    annotation_fields = {"INFO": None}
5136
5137                log.info(f"Annotations Annovar - database '{annotation}'")
5138                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
5139
5140                # Tmp file for annovar
5141                err_files = []
5142                tmp_annotate_vcf_directory = TemporaryDirectory(
5143                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
5144                )
5145                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
5146                tmp_annotate_vcf_name_annovar = (
5147                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
5148                )
5149                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
5150                err_files.append(tmp_annotate_vcf_name_err)
5151                tmp_files.append(tmp_annotate_vcf_name_err)
5152
5153                # Tmp file final vcf annotated by annovar
5154                tmp_annotate_vcf = NamedTemporaryFile(
5155                    prefix=self.get_prefix(),
5156                    dir=self.get_tmp_dir(),
5157                    suffix=".vcf.gz",
5158                    delete=False,
5159                )
5160                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5161                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
5162                tmp_files.append(tmp_annotate_vcf_name)
5163                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
5164
5165                # Number of fields
5166                annotation_list = []
5167                annotation_renamed_list = []
5168
5169                for annotation_field in annotation_fields:
5170
5171                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
5172                    annotation_fields_new_name = annotation_fields.get(
5173                        annotation_field, annotation_field
5174                    )
5175                    if not annotation_fields_new_name:
5176                        annotation_fields_new_name = annotation_field
5177
5178                    if (
5179                        force_update_annotation
5180                        or annotation_fields_new_name not in self.get_header().infos
5181                    ):
5182                        annotation_list.append(annotation_field)
5183                        annotation_renamed_list.append(annotation_fields_new_name)
5184                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
5185                        log.warning(
5186                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
5187                        )
5188
5189                    # Add rename info
5190                    run_parallel_commands(
5191                        [
5192                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
5193                        ],
5194                        1,
5195                    )
5196
5197                # log.debug("fields_to_removed: " + str(fields_to_removed))
5198                log.debug("annotation_list: " + str(annotation_list))
5199
5200                # protocol
5201                protocol = annotation
5202
5203                # argument
5204                argument = ""
5205
5206                # operation
5207                operation = "f"
5208                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
5209                    "ensGene"
5210                ):
5211                    operation = "g"
5212                    if options.get("genebase", None):
5213                        argument = f"""'{options.get("genebase","")}'"""
5214                elif annotation in ["cytoBand"]:
5215                    operation = "r"
5216
5217                # argument option
5218                argument_option = ""
5219                if argument != "":
5220                    argument_option = " --argument " + argument
5221
5222                # command options
5223                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
5224                for option in options:
5225                    if option not in ["genebase"]:
5226                        command_options += f""" --{option}={options[option]}"""
5227
5228                # Command
5229
5230                # Command - Annovar
5231                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
5232                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
5233
5234                # Command - start pipe
5235                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
5236
5237                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
5238                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
5239
5240                # Command - Special characters (refGene annotation)
5241                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
5242
5243                # Command - Clean empty fields (with value ".")
5244                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
5245
5246                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
5247                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
5248                if "ALL" not in annotation_list and "INFO" not in annotation_list:
5249                    # for ann in annotation_renamed_list:
5250                    for ann in annotation_list:
5251                        annovar_fields_to_keep.append(f"^INFO/{ann}")
5252
5253                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
5254
5255                # Command - indexing
5256                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
5257
5258                log.debug(f"Annotation - Annovar command: {command_annovar}")
5259                run_parallel_commands([command_annovar], 1)
5260
5261                # Error messages
5262                log.info(f"Error/Warning messages:")
5263                error_message_command_all = []
5264                error_message_command_warning = []
5265                error_message_command_err = []
5266                for err_file in err_files:
5267                    with open(err_file, "r") as f:
5268                        for line in f:
5269                            message = line.strip()
5270                            error_message_command_all.append(message)
5271                            if line.startswith("[W::") or line.startswith("WARNING"):
5272                                error_message_command_warning.append(message)
5273                            if line.startswith("[E::") or line.startswith("ERROR"):
5274                                error_message_command_err.append(
5275                                    f"{err_file}: " + message
5276                                )
5277                # log info
5278                for message in list(
5279                    set(error_message_command_err + error_message_command_warning)
5280                ):
5281                    log.info(f"   {message}")
5282                # debug info
5283                for message in list(set(error_message_command_all)):
5284                    log.debug(f"   {message}")
5285                # failed
5286                if len(error_message_command_err):
5287                    log.error("Annotation failed: Error in commands")
5288                    raise ValueError("Annotation failed: Error in commands")
5289
5290            if tmp_annotates_vcf_name_list:
5291
5292                # List of annotated files
5293                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
5294
5295                # Tmp file
5296                tmp_annotate_vcf = NamedTemporaryFile(
5297                    prefix=self.get_prefix(),
5298                    dir=self.get_tmp_dir(),
5299                    suffix=".vcf.gz",
5300                    delete=False,
5301                )
5302                tmp_annotate_vcf_name = tmp_annotate_vcf.name
5303                tmp_files.append(tmp_annotate_vcf_name)
5304                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5305                err_files.append(tmp_annotate_vcf_name_err)
5306                tmp_files.append(tmp_annotate_vcf_name_err)
5307
5308                # Command merge
5309                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
5310                log.info(
5311                    f"Annotation Annovar - Annotation merging "
5312                    + str(len(tmp_annotates_vcf_name_list))
5313                    + " annotated files"
5314                )
5315                log.debug(f"Annotation - merge command: {merge_command}")
5316                run_parallel_commands([merge_command], 1)
5317
5318                # Find annotation in header
5319                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
5320                    header_list = self.read_vcf_header(f)
5321                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5322
5323                for ann in annovar_vcf_header.infos:
5324                    if ann not in self.get_header().infos:
5325                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5326
5327                # Update variants
5328                log.info(f"Annotation Annovar - Updating...")
5329                self.update_from_vcf(tmp_annotate_vcf_name)
5330
5331            # Clean files
5332            # Tmp file remove command
5333            if True:
5334                tmp_files_remove_command = ""
5335                if tmp_files:
5336                    tmp_files_remove_command = " ".join(tmp_files)
5337                clean_command = f" rm -f {tmp_files_remove_command} "
5338                log.debug(f"Annotation Annovar - Annotation cleaning ")
5339                log.debug(f"Annotation - cleaning command: {clean_command}")
5340                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations.

Parameters
  • threads: number of threads to use
Returns

None — the variants table is updated in place.

def annotation_parquet(self, threads: int = None) -> None:
5343    def annotation_parquet(self, threads: int = None) -> None:
5344        """
5345        It takes a VCF file, and annotates it with a parquet file
5346
5347        :param threads: number of threads to use for the annotation
5348        :return: the value of the variable "result".
5349        """
5350
5351        # DEBUG
5352        log.debug("Start annotation with parquet databases")
5353
5354        # Threads
5355        if not threads:
5356            threads = self.get_threads()
5357        log.debug("Threads: " + str(threads))
5358
5359        # DEBUG
5360        delete_tmp = True
5361        if self.get_config().get("verbosity", "warning") in ["debug"]:
5362            delete_tmp = False
5363            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5364
5365        # Config
5366        databases_folders = set(
5367            self.get_config()
5368            .get("folders", {})
5369            .get("databases", {})
5370            .get("annotations", ["."])
5371            + self.get_config()
5372            .get("folders", {})
5373            .get("databases", {})
5374            .get("parquet", ["."])
5375        )
5376        log.debug("Databases annotations: " + str(databases_folders))
5377
5378        # Param
5379        annotations = (
5380            self.get_param()
5381            .get("annotation", {})
5382            .get("parquet", {})
5383            .get("annotations", None)
5384        )
5385        log.debug("Annotations: " + str(annotations))
5386
5387        # Assembly
5388        assembly = self.get_param().get(
5389            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5390        )
5391
5392        # Force Update Annotation
5393        force_update_annotation = (
5394            self.get_param()
5395            .get("annotation", {})
5396            .get("options", {})
5397            .get("annotations_update", False)
5398        )
5399        log.debug(f"force_update_annotation={force_update_annotation}")
5400        force_append_annotation = (
5401            self.get_param()
5402            .get("annotation", {})
5403            .get("options", {})
5404            .get("annotations_append", False)
5405        )
5406        log.debug(f"force_append_annotation={force_append_annotation}")
5407
5408        # Data
5409        table_variants = self.get_table_variants()
5410
5411        # Check if not empty
5412        log.debug("Check if not empty")
5413        sql_query_chromosomes_df = self.get_query_to_df(
5414            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5415        )
5416        if not sql_query_chromosomes_df["count"][0]:
5417            log.info(f"VCF empty")
5418            return
5419
5420        # VCF header
5421        vcf_reader = self.get_header()
5422        log.debug("Initial header: " + str(vcf_reader.infos))
5423
5424        # Nb Variants POS
5425        log.debug("NB Variants Start")
5426        nb_variants = self.conn.execute(
5427            f"SELECT count(*) AS count FROM variants"
5428        ).fetchdf()["count"][0]
5429        log.debug("NB Variants Stop")
5430
5431        # Existing annotations
5432        for vcf_annotation in self.get_header().infos:
5433
5434            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5435            log.debug(
5436                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5437            )
5438
5439        # Added columns
5440        added_columns = []
5441
5442        # drop indexes
5443        log.debug(f"Drop indexes...")
5444        self.drop_indexes()
5445
5446        if annotations:
5447
5448            if "ALL" in annotations:
5449
5450                all_param = annotations.get("ALL", {})
5451                all_param_formats = all_param.get("formats", None)
5452                all_param_releases = all_param.get("releases", None)
5453
5454                databases_infos_dict = self.scan_databases(
5455                    database_formats=all_param_formats,
5456                    database_releases=all_param_releases,
5457                )
5458                for database_infos in databases_infos_dict.keys():
5459                    if database_infos not in annotations:
5460                        annotations[database_infos] = {"INFO": None}
5461
5462            for annotation in annotations:
5463
5464                if annotation in ["ALL"]:
5465                    continue
5466
5467                # Annotation Name
5468                annotation_name = os.path.basename(annotation)
5469
5470                # Annotation fields
5471                annotation_fields = annotations[annotation]
5472                if not annotation_fields:
5473                    annotation_fields = {"INFO": None}
5474
5475                log.debug(f"Annotation '{annotation_name}'")
5476                log.debug(
5477                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5478                )
5479
5480                # Create Database
5481                database = Database(
5482                    database=annotation,
5483                    databases_folders=databases_folders,
5484                    assembly=assembly,
5485                )
5486
5487                # Find files
5488                parquet_file = database.get_database()
5489                parquet_hdr_file = database.get_header_file()
5490                parquet_type = database.get_type()
5491
5492                # Check if files exists
5493                if not parquet_file or not parquet_hdr_file:
5494                    log.error("Annotation failed: file not found")
5495                    raise ValueError("Annotation failed: file not found")
5496                else:
5497                    # Get parquet connexion
5498                    parquet_sql_attach = database.get_sql_database_attach(
5499                        output="query"
5500                    )
5501                    if parquet_sql_attach:
5502                        self.conn.execute(parquet_sql_attach)
5503                    parquet_file_link = database.get_sql_database_link()
5504                    # Log
5505                    log.debug(
5506                        f"Annotation '{annotation_name}' - file: "
5507                        + str(parquet_file)
5508                        + " and "
5509                        + str(parquet_hdr_file)
5510                    )
5511
5512                    # Database full header columns
5513                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5514                        parquet_hdr_file
5515                    )
5516                    # Log
5517                    log.debug(
5518                        "Annotation database header columns : "
5519                        + str(parquet_hdr_vcf_header_columns)
5520                    )
5521
5522                    # Load header as VCF object
5523                    parquet_hdr_vcf_header_infos = database.get_header().infos
5524                    # Log
5525                    log.debug(
5526                        "Annotation database header: "
5527                        + str(parquet_hdr_vcf_header_infos)
5528                    )
5529
5530                    # Get extra infos
5531                    parquet_columns = database.get_extra_columns()
5532                    # Log
5533                    log.debug("Annotation database Columns: " + str(parquet_columns))
5534
5535                    # Add extra columns if "ALL" in annotation_fields
5536                    # if "ALL" in annotation_fields:
5537                    #     allow_add_extra_column = True
5538                    if "ALL" in annotation_fields and database.get_extra_columns():
5539                        for extra_column in database.get_extra_columns():
5540                            if (
5541                                extra_column not in annotation_fields
5542                                and extra_column.replace("INFO/", "")
5543                                not in parquet_hdr_vcf_header_infos
5544                            ):
5545                                parquet_hdr_vcf_header_infos[extra_column] = (
5546                                    vcf.parser._Info(
5547                                        extra_column,
5548                                        ".",
5549                                        "String",
5550                                        f"{extra_column} description",
5551                                        "unknown",
5552                                        "unknown",
5553                                        self.code_type_map["String"],
5554                                    )
5555                                )
5556
5557                    # For all fields in database
5558                    annotation_fields_all = False
5559                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5560                        annotation_fields_all = True
5561                        annotation_fields = {
5562                            key: key for key in parquet_hdr_vcf_header_infos
5563                        }
5564
5565                        log.debug(
5566                            "Annotation database header - All annotations added: "
5567                            + str(annotation_fields)
5568                        )
5569
5570                    # Init
5571
5572                    # List of annotation fields to use
5573                    sql_query_annotation_update_info_sets = []
5574
5575                    # List of annotation to agregate
5576                    sql_query_annotation_to_agregate = []
5577
5578                    # Number of fields
5579                    nb_annotation_field = 0
5580
5581                    # Annotation fields processed
5582                    annotation_fields_processed = []
5583
5584                    # Columns mapping
5585                    map_columns = database.map_columns(
5586                        columns=annotation_fields, prefixes=["INFO/"]
5587                    )
5588
5589                    # Query dict for fields to remove (update option)
5590                    query_dict_remove = {}
5591
5592                    # Fetch Anotation fields
5593                    for annotation_field in annotation_fields:
5594
5595                        # annotation_field_column
5596                        annotation_field_column = map_columns.get(
5597                            annotation_field, "INFO"
5598                        )
5599
5600                        # field new name, if parametered
5601                        annotation_fields_new_name = annotation_fields.get(
5602                            annotation_field, annotation_field
5603                        )
5604                        if not annotation_fields_new_name:
5605                            annotation_fields_new_name = annotation_field
5606
5607                        # To annotate
5608                        # force_update_annotation = True
5609                        # force_append_annotation = True
5610                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5611                        if annotation_field in parquet_hdr_vcf_header_infos and (
5612                            force_update_annotation
5613                            or force_append_annotation
5614                            or (
5615                                annotation_fields_new_name
5616                                not in self.get_header().infos
5617                            )
5618                        ):
5619
5620                            # Add field to annotation to process list
5621                            annotation_fields_processed.append(
5622                                annotation_fields_new_name
5623                            )
5624
5625                            # explode infos for the field
5626                            annotation_fields_new_name_info_msg = ""
5627                            if (
5628                                force_update_annotation
5629                                and annotation_fields_new_name
5630                                in self.get_header().infos
5631                            ):
5632                                # Remove field from INFO
5633                                query = f"""
5634                                    UPDATE {table_variants} as table_variants
5635                                    SET INFO = REGEXP_REPLACE(
5636                                                concat(table_variants.INFO,''),
5637                                                ';*{annotation_fields_new_name}=[^;]*',
5638                                                ''
5639                                                )
5640                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5641                                """
5642                                annotation_fields_new_name_info_msg = " [update]"
5643                                query_dict_remove[
5644                                    f"remove 'INFO/{annotation_fields_new_name}'"
5645                                ] = query
5646
5647                            # Sep between fields in INFO
5648                            nb_annotation_field += 1
5649                            if nb_annotation_field > 1:
5650                                annotation_field_sep = ";"
5651                            else:
5652                                annotation_field_sep = ""
5653
5654                            log.info(
5655                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5656                            )
5657
5658                            # Add INFO field to header
5659                            parquet_hdr_vcf_header_infos_number = (
5660                                parquet_hdr_vcf_header_infos[annotation_field].num
5661                                or "."
5662                            )
5663                            parquet_hdr_vcf_header_infos_type = (
5664                                parquet_hdr_vcf_header_infos[annotation_field].type
5665                                or "String"
5666                            )
5667                            parquet_hdr_vcf_header_infos_description = (
5668                                parquet_hdr_vcf_header_infos[annotation_field].desc
5669                                or f"{annotation_field} description"
5670                            )
5671                            parquet_hdr_vcf_header_infos_source = (
5672                                parquet_hdr_vcf_header_infos[annotation_field].source
5673                                or "unknown"
5674                            )
5675                            parquet_hdr_vcf_header_infos_version = (
5676                                parquet_hdr_vcf_header_infos[annotation_field].version
5677                                or "unknown"
5678                            )
5679
5680                            vcf_reader.infos[annotation_fields_new_name] = (
5681                                vcf.parser._Info(
5682                                    annotation_fields_new_name,
5683                                    parquet_hdr_vcf_header_infos_number,
5684                                    parquet_hdr_vcf_header_infos_type,
5685                                    parquet_hdr_vcf_header_infos_description,
5686                                    parquet_hdr_vcf_header_infos_source,
5687                                    parquet_hdr_vcf_header_infos_version,
5688                                    self.code_type_map[
5689                                        parquet_hdr_vcf_header_infos_type
5690                                    ],
5691                                )
5692                            )
5693
5694                            # Append
5695                            if force_append_annotation:
5696                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5697                            else:
5698                                query_case_when_append = ""
5699
5700                            # Annotation/Update query fields
5701                            # Found in INFO column
5702                            if (
5703                                annotation_field_column == "INFO"
5704                                and "INFO" in parquet_hdr_vcf_header_columns
5705                            ):
5706                                sql_query_annotation_update_info_sets.append(
5707                                    f"""
5708                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5709                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5710                                        ELSE ''
5711                                    END
5712                                """
5713                                )
5714                            # Found in a specific column
5715                            else:
5716                                sql_query_annotation_update_info_sets.append(
5717                                    f"""
5718                                CASE WHEN table_parquet."{annotation_field_column}" NOT IN ('','.') {query_case_when_append}
5719                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(table_parquet."{annotation_field_column}", ';', ','))
5720                                        ELSE ''
5721                                    END
5722                                """
5723                                )
5724                                sql_query_annotation_to_agregate.append(
5725                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5726                                )
5727
5728                        # Not to annotate
5729                        else:
5730
5731                            if force_update_annotation:
5732                                annotation_message = "forced"
5733                            else:
5734                                annotation_message = "skipped"
5735
5736                            if annotation_field not in parquet_hdr_vcf_header_infos:
5737                                log.warning(
5738                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5739                                )
5740                            if annotation_fields_new_name in self.get_header().infos:
5741                                log.warning(
5742                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5743                                )
5744
5745                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5746                    # allow_annotation_full_info = True
5747                    allow_annotation_full_info = not force_append_annotation
5748
5749                    if parquet_type in ["regions"]:
5750                        allow_annotation_full_info = False
5751
5752                    if (
5753                        allow_annotation_full_info
5754                        and nb_annotation_field == len(annotation_fields)
5755                        and annotation_fields_all
5756                        and (
5757                            "INFO" in parquet_hdr_vcf_header_columns
5758                            and "INFO" in database.get_extra_columns()
5759                        )
5760                    ):
5761                        log.debug("Column INFO annotation enabled")
5762                        sql_query_annotation_update_info_sets = []
5763                        sql_query_annotation_update_info_sets.append(
5764                            f" table_parquet.INFO "
5765                        )
5766
5767                    if sql_query_annotation_update_info_sets:
5768
5769                        # Annotate
5770                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5771
5772                        # Join query annotation update info sets for SQL
5773                        sql_query_annotation_update_info_sets_sql = ",".join(
5774                            sql_query_annotation_update_info_sets
5775                        )
5776
5777                        # Check chromosomes list (and variants infos)
5778                        sql_query_chromosomes = f"""
5779                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5780                            FROM {table_variants} as table_variants
5781                            GROUP BY table_variants."#CHROM"
5782                            ORDER BY table_variants."#CHROM"
5783                            """
5784                        sql_query_chromosomes_df = self.conn.execute(
5785                            sql_query_chromosomes
5786                        ).df()
5787                        sql_query_chromosomes_dict = {
5788                            entry["CHROM"]: {
5789                                "count": entry["count_variants"],
5790                                "min": entry["min_variants"],
5791                                "max": entry["max_variants"],
5792                            }
5793                            for index, entry in sql_query_chromosomes_df.iterrows()
5794                        }
5795
5796                        # Init
5797                        nb_of_query = 0
5798                        nb_of_variant_annotated = 0
5799                        query_dict = query_dict_remove
5800
5801                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5802                        for chrom in sql_query_chromosomes_dict:
5803
5804                            # Number of variant by chromosome
5805                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5806                                chrom, {}
5807                            ).get("count", 0)
5808
5809                            log.debug(
5810                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5811                            )
5812
5813                            # Annotation with regions database
5814                            if parquet_type in ["regions"]:
5815                                sql_query_annotation_from_clause = f"""
5816                                    FROM (
5817                                        SELECT 
5818                                            '{chrom}' AS \"#CHROM\",
5819                                            table_variants_from.\"POS\" AS \"POS\",
5820                                            {",".join(sql_query_annotation_to_agregate)}
5821                                        FROM {table_variants} as table_variants_from
5822                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5823                                            table_parquet_from."#CHROM" = '{chrom}'
5824                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5825                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5826                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5827                                                )
5828                                        )
5829                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5830                                        GROUP BY table_variants_from.\"POS\"
5831                                        )
5832                                        as table_parquet
5833                                """
5834
5835                                sql_query_annotation_where_clause = """
5836                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5837                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5838                                """
5839
5840                            # Annotation with variants database
5841                            else:
5842                                sql_query_annotation_from_clause = f"""
5843                                    FROM {parquet_file_link} as table_parquet
5844                                """
5845                                sql_query_annotation_where_clause = f"""
5846                                    table_variants."#CHROM" = '{chrom}'
5847                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5848                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5849                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5850                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5851                                """
5852
5853                            # Create update query
5854                            sql_query_annotation_chrom_interval_pos = f"""
5855                                UPDATE {table_variants} as table_variants
5856                                    SET INFO = 
5857                                        concat(
5858                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5859                                                THEN table_variants.INFO
5860                                                ELSE ''
5861                                            END
5862                                            ,
5863                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5864                                                        AND (
5865                                                        concat({sql_query_annotation_update_info_sets_sql})
5866                                                        )
5867                                                        NOT IN ('','.') 
5868                                                    THEN ';'
5869                                                    ELSE ''
5870                                            END
5871                                            ,
5872                                            {sql_query_annotation_update_info_sets_sql}
5873                                            )
5874                                    {sql_query_annotation_from_clause}
5875                                    WHERE {sql_query_annotation_where_clause}
5876                                    ;
5877                                """
5878
5879                            # Add update query to dict
5880                            query_dict[
5881                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
5882                            ] = sql_query_annotation_chrom_interval_pos
5883
5884                        nb_of_query = len(query_dict)
5885                        num_query = 0
5886
5887                        # SET max_expression_depth TO x
5888                        self.conn.execute("SET max_expression_depth TO 10000")
5889
5890                        for query_name in query_dict:
5891                            query = query_dict[query_name]
5892                            num_query += 1
5893                            log.info(
5894                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
5895                            )
5896                            result = self.conn.execute(query)
5897                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
5898                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
5899                            log.info(
5900                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
5901                            )
5902
5903                        log.info(
5904                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
5905                        )
5906
5907                    else:
5908
5909                        log.info(
5910                            f"Annotation '{annotation_name}' - No Annotations available"
5911                        )
5912
5913                    log.debug("Final header: " + str(vcf_reader.infos))
5914
5915        # Remove added columns
5916        for added_column in added_columns:
5917            self.drop_column(column=added_column)

Takes a VCF file and annotates it with a Parquet annotation database.

Parameters
  • threads: number of threads to use for the annotation
Returns

None; the variants table is updated in place with the annotations.

def annotation_splice(self, threads: int = None) -> None:
5919    def annotation_splice(self, threads: int = None) -> None:
5920        """
5921        This function annotate with snpEff
5922
5923        :param threads: The number of threads to use
5924        :return: the value of the variable "return_value".
5925        """
5926
5927        # DEBUG
5928        log.debug("Start annotation with splice tools")
5929
5930        # Threads
5931        if not threads:
5932            threads = self.get_threads()
5933        log.debug("Threads: " + str(threads))
5934
5935        # DEBUG
5936        delete_tmp = True
5937        if self.get_config().get("verbosity", "warning") in ["debug"]:
5938            delete_tmp = False
5939            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5940
5941        # Config
5942        config = self.get_config()
5943        log.debug("Config: " + str(config))
5944        splice_config = config.get("tools", {}).get("splice", {})
5945        if not splice_config:
5946            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
5947        if not splice_config:
5948            msg_err = "No Splice tool config"
5949            log.error(msg_err)
5950            raise ValueError(msg_err)
5951        log.debug(f"splice_config={splice_config}")
5952
5953        # Config - Folders - Databases
5954        databases_folders = (
5955            config.get("folders", {}).get("databases", {}).get("splice", ["."])
5956        )
5957        log.debug("Databases annotations: " + str(databases_folders))
5958
5959        # Splice docker image
5960        splice_docker_image = splice_config.get("docker").get("image")
5961
5962        # Pull splice image if it's not already there
5963        if not check_docker_image_exists(splice_docker_image):
5964            log.warning(
5965                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
5966            )
5967            try:
5968                command(f"docker pull {splice_config.get('docker').get('image')}")
5969            except subprocess.CalledProcessError:
5970                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
5971                log.error(msg_err)
5972                raise ValueError(msg_err)
5973                return None
5974
5975        # Config - splice databases
5976        splice_databases = (
5977            config.get("folders", {})
5978            .get("databases", {})
5979            .get("splice", DEFAULT_SPLICE_FOLDER)
5980        )
5981        splice_databases = full_path(splice_databases)
5982
5983        # Param
5984        param = self.get_param()
5985        log.debug("Param: " + str(param))
5986
5987        # Param
5988        options = param.get("annotation", {}).get("splice", {})
5989        log.debug("Options: " + str(options))
5990
5991        # Data
5992        table_variants = self.get_table_variants()
5993
5994        # Check if not empty
5995        log.debug("Check if not empty")
5996        sql_query_chromosomes = (
5997            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
5998        )
5999        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6000            log.info("VCF empty")
6001            return None
6002
6003        # Export in VCF
6004        log.debug("Create initial file to annotate")
6005
6006        # Create output folder
6007        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6008        if not os.path.exists(output_folder):
6009            Path(output_folder).mkdir(parents=True, exist_ok=True)
6010
6011        # Create tmp VCF file
6012        tmp_vcf = NamedTemporaryFile(
6013            prefix=self.get_prefix(),
6014            dir=output_folder,
6015            suffix=".vcf",
6016            delete=False,
6017        )
6018        tmp_vcf_name = tmp_vcf.name
6019
6020        # VCF header
6021        header = self.get_header()
6022
6023        # Existing annotations
6024        for vcf_annotation in self.get_header().infos:
6025
6026            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6027            log.debug(
6028                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6029            )
6030
6031        # Memory limit
6032        if config.get("memory", None):
6033            memory_limit = config.get("memory", "8G").upper()
6034            # upper()
6035        else:
6036            memory_limit = "8G"
6037        log.debug(f"memory_limit: {memory_limit}")
6038
6039        # Check number of variants to annotate
6040        where_clause_regex_spliceai = r"SpliceAI_\w+"
6041        where_clause_regex_spip = r"SPiP_\w+"
6042        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6043        df_list_of_variants_to_annotate = self.get_query_to_df(
6044            query=f""" SELECT * FROM variants {where_clause} """
6045        )
6046        if len(df_list_of_variants_to_annotate) == 0:
6047            log.warning(
6048                f"No variants to annotate with splice. Variants probably already annotated with splice"
6049            )
6050            return None
6051        else:
6052            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6053
6054        # Export VCF file
6055        self.export_variant_vcf(
6056            vcf_file=tmp_vcf_name,
6057            remove_info=True,
6058            add_samples=True,
6059            index=False,
6060            where_clause=where_clause,
6061        )
6062
6063        # Create docker container and launch splice analysis
6064        if splice_config:
6065
6066            # Splice mount folders
6067            mount_folders = splice_config.get("mount", {})
6068
6069            # Genome mount
6070            mount_folders[
6071                config.get("folders", {})
6072                .get("databases", {})
6073                .get("genomes", DEFAULT_GENOME_FOLDER)
6074            ] = "ro"
6075
6076            # SpliceAI mount
6077            mount_folders[
6078                config.get("folders", {})
6079                .get("databases", {})
6080                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6081            ] = "ro"
6082
6083            # Genome mount
6084            mount_folders[
6085                config.get("folders", {})
6086                .get("databases", {})
6087                .get("spip", DEFAULT_SPIP_FOLDER)
6088            ] = "ro"
6089
6090            # Mount folders
6091            mount = []
6092
6093            # Config mount
6094            mount = [
6095                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6096                for path, mode in mount_folders.items()
6097            ]
6098
6099            if any(value for value in splice_config.values() if value is None):
6100                log.warning("At least one splice config parameter is empty")
6101                return None
6102
6103            # Params in splice nf
6104            def check_values(dico: dict):
6105                """
6106                Ensure parameters for NF splice pipeline
6107                """
6108                for key, val in dico.items():
6109                    if key == "genome":
6110                        if any(
6111                            assemb in options.get("genome", {})
6112                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6113                        ):
6114                            yield f"--{key} hg19"
6115                        elif any(
6116                            assemb in options.get("genome", {})
6117                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6118                        ):
6119                            yield f"--{key} hg38"
6120                    elif (
6121                        (isinstance(val, str) and val)
6122                        or isinstance(val, int)
6123                        or isinstance(val, bool)
6124                    ):
6125                        yield f"--{key} {val}"
6126
6127            # Genome
6128            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6129            options["genome"] = genome
6130
6131            # NF params
6132            nf_params = []
6133
6134            # Add options
6135            if options:
6136                nf_params = list(check_values(options))
6137                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6138            else:
6139                log.debug("No NF params provided")
6140
6141            # Add threads
6142            if "threads" not in options.keys():
6143                nf_params.append(f"--threads {threads}")
6144
6145            # Genome path
6146            genome_path = find_genome(
6147                config.get("folders", {})
6148                .get("databases", {})
6149                .get("genomes", DEFAULT_GENOME_FOLDER),
6150                file=f"{genome}.fa",
6151            )
6152            # Add genome path
6153            if not genome_path:
6154                raise ValueError(
6155                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6156                )
6157            else:
6158                log.debug(f"Genome: {genome_path}")
6159                nf_params.append(f"--genome_path {genome_path}")
6160
6161            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6162                """
6163                Setting up updated databases for SPiP and SpliceAI
6164                """
6165
6166                try:
6167
6168                    # SpliceAI assembly transcriptome
6169                    spliceai_assembly = os.path.join(
6170                        config.get("folders", {})
6171                        .get("databases", {})
6172                        .get("spliceai", {}),
6173                        options.get("genome"),
6174                        "transcriptome",
6175                    )
6176                    spip_assembly = options.get("genome")
6177
6178                    spip = find(
6179                        f"transcriptome_{spip_assembly}.RData",
6180                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6181                    )
6182                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6183                    log.debug(f"SPiP annotations: {spip}")
6184                    log.debug(f"SpliceAI annotations: {spliceai}")
6185                    if spip and spliceai:
6186                        return [
6187                            f"--spip_transcriptome {spip}",
6188                            f"--spliceai_annotations {spliceai}",
6189                        ]
6190                    else:
6191                        # TODO crash and go on with basic annotations ?
6192                        # raise ValueError(
6193                        #     "Can't find splice databases in configuration EXIT"
6194                        # )
6195                        log.warning(
6196                            "Can't find splice databases in configuration, use annotations file from image"
6197                        )
6198                except TypeError:
6199                    log.warning(
6200                        "Can't find splice databases in configuration, use annotations file from image"
6201                    )
6202                    return []
6203
6204            # Add options, check if transcriptome option have already beend provided
6205            if (
6206                "spip_transcriptome" not in nf_params
6207                and "spliceai_transcriptome" not in nf_params
6208            ):
6209                splice_reference = splice_annotations(options, config)
6210                if splice_reference:
6211                    nf_params.extend(splice_reference)
6212
6213            nf_params.append(f"--output_folder {output_folder}")
6214
6215            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6216            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6217            log.debug(cmd)
6218
6219            splice_config["docker"]["command"] = cmd
6220
6221            docker_cmd = get_bin_command(
6222                tool="splice",
6223                bin_type="docker",
6224                config=config,
6225                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6226                add_options=f"--name {random_uuid} {' '.join(mount)}",
6227            )
6228
6229            # Docker debug
6230            # if splice_config.get("rm_container"):
6231            #     rm_container = "--rm"
6232            # else:
6233            #     rm_container = ""
6234            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6235
6236            log.debug(docker_cmd)
6237            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6238            log.debug(res.stdout)
6239            if res.stderr:
6240                log.error(res.stderr)
6241            res.check_returncode()
6242        else:
6243            log.warning(f"Splice tool configuration not found: {config}")
6244
6245        # Update variants
6246        log.info("Annotation - Updating...")
6247        # Test find output vcf
6248        log.debug(
6249            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6250        )
6251        output_vcf = []
6252        # Wrong folder to look in
6253        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6254            if (
6255                files
6256                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6257            ):
6258                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6259        # log.debug(os.listdir(options.get("output_folder")))
6260        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6261        if not output_vcf:
6262            log.debug(
6263                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6264            )
6265        else:
6266            # Get new header from annotated vcf
6267            log.debug(f"Initial header: {len(header.infos)} fields")
6268            # Create new header with splice infos
6269            new_vcf = Variants(input=output_vcf[0])
6270            new_vcf_header = new_vcf.get_header().infos
6271            for keys, infos in new_vcf_header.items():
6272                if keys not in header.infos.keys():
6273                    header.infos[keys] = infos
6274            log.debug(f"New header: {len(header.infos)} fields")
6275            log.debug(f"Splice tmp output: {output_vcf[0]}")
6276            self.update_from_vcf(output_vcf[0])
6277
6278        # Remove folder
6279        remove_if_exists(output_folder)

This function annotate with snpEff

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def get_config_default(self, name: str) -> dict:
    """
    Return the default configuration dictionary for a given section.

    Known sections are "calculations" (built-in calculation operations,
    either SQL-based or Python-based) and "prioritizations" (default
    variant prioritization profiles).

    :param name: Name of the configuration section to retrieve, e.g.
        "calculations" or "prioritizations"
    :type name: str
    :return: The default configuration for `name`, or an empty dictionary
        if `name` does not match any known section
    """

    config_default = {
        # Built-in calculation operations, keyed by operation name.
        # "sql" operations provide an SQL expression ("operation_query");
        # "python" operations name a function and its parameters.
        "calculations": {
            "variant_chr_pos_alt_ref": {
                "type": "sql",
                "name": "variant_chr_pos_alt_ref",
                "description": "Create a variant ID with chromosome, position, alt and ref",
                "available": False,
                "output_column_name": "variant_chr_pos_alt_ref",
                "output_column_type": "String",
                "output_column_description": "variant ID with chromosome, position, alt and ref",
                "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
                "operation_info": True,
            },
            "VARTYPE": {
                "type": "sql",
                "name": "VARTYPE",
                "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
                "available": True,
                "output_column_name": "VARTYPE",
                "output_column_type": "String",
                "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
                "operation_query": """
                        CASE
                            WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
                            WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
                            WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
                            WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
                            WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
                            ELSE 'UNDEFINED'
                        END
                        """,
                "info_fields": ["SVTYPE"],
                "operation_info": True,
            },
            "snpeff_hgvs": {
                "type": "python",
                "name": "snpeff_hgvs",
                "description": "HGVS nomenclatures from snpEff annotation",
                "available": True,
                "function_name": "calculation_extract_snpeff_hgvs",
                "function_params": ["snpeff_hgvs", "ANN"],
            },
            "snpeff_ann_explode": {
                "type": "python",
                "name": "snpeff_ann_explode",
                "description": "Explode snpEff annotations with uniquify values",
                "available": True,
                "function_name": "calculation_snpeff_ann_explode",
                "function_params": [False, "fields", "snpeff_", "ANN"],
            },
            "snpeff_ann_explode_uniquify": {
                "type": "python",
                "name": "snpeff_ann_explode_uniquify",
                "description": "Explode snpEff annotations",
                "available": True,
                "function_name": "calculation_snpeff_ann_explode",
                "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
            },
            "snpeff_ann_explode_json": {
                "type": "python",
                "name": "snpeff_ann_explode_json",
                "description": "Explode snpEff annotations in JSON format",
                "available": True,
                "function_name": "calculation_snpeff_ann_explode",
                "function_params": [False, "JSON", "snpeff_json", "ANN"],
            },
            "NOMEN": {
                "type": "python",
                "name": "NOMEN",
                "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
                "available": True,
                "function_name": "calculation_extract_nomen",
                "function_params": [],
            },
            "FINDBYPIPELINE": {
                "type": "python",
                "name": "FINDBYPIPELINE",
                "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
                "available": True,
                "function_name": "calculation_find_by_pipeline",
                "function_params": ["findbypipeline"],
            },
            "FINDBYSAMPLE": {
                "type": "python",
                "name": "FINDBYSAMPLE",
                "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
                "available": True,
                "function_name": "calculation_find_by_pipeline",
                "function_params": ["findbysample"],
            },
            "GENOTYPECONCORDANCE": {
                "type": "python",
                "name": "GENOTYPECONCORDANCE",
                "description": "Concordance of genotype for multi caller VCF",
                "available": True,
                "function_name": "calculation_genotype_concordance",
                "function_params": [],
            },
            "BARCODE": {
                "type": "python",
                "name": "BARCODE",
                "description": "BARCODE as VaRank tool",
                "available": True,
                "function_name": "calculation_barcode",
                "function_params": [],
            },
            "BARCODEFAMILY": {
                "type": "python",
                "name": "BARCODEFAMILY",
                "description": "BARCODEFAMILY as VaRank tool",
                "available": True,
                "function_name": "calculation_barcode_family",
                "function_params": ["BCF"],
            },
            "TRIO": {
                "type": "python",
                "name": "TRIO",
                "description": "Inheritance for a trio family",
                "available": True,
                "function_name": "calculation_trio",
                "function_params": [],
            },
            "VAF": {
                "type": "python",
                "name": "VAF",
                "description": "Variant Allele Frequency (VAF) harmonization",
                "available": True,
                "function_name": "calculation_vaf_normalization",
                "function_params": [],
            },
            "VAF_stats": {
                "type": "python",
                "name": "VAF_stats",
                "description": "Variant Allele Frequency (VAF) statistics",
                "available": True,
                "function_name": "calculation_genotype_stats",
                "function_params": ["VAF"],
            },
            "DP_stats": {
                "type": "python",
                "name": "DP_stats",
                "description": "Depth (DP) statistics",
                "available": True,
                "function_name": "calculation_genotype_stats",
                "function_params": ["DP"],
            },
            "variant_id": {
                "type": "python",
                "name": "variant_id",
                "description": "Variant ID generated from variant position and type",
                "available": True,
                "function_name": "calculation_variant_id",
                "function_params": [],
            },
            "transcripts_json": {
                "type": "python",
                "name": "transcripts_json",
                "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
                "available": True,
                "function_name": "calculation_transcripts_annotation",
                "function_params": ["transcripts_json", None],
            },
            "transcripts_ann": {
                "type": "python",
                "name": "transcripts_ann",
                "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
                "available": True,
                "function_name": "calculation_transcripts_annotation",
                "function_params": [None, "transcripts_ann"],
            },
            "transcripts_annotations": {
                "type": "python",
                "name": "transcripts_annotations",
                "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
                "available": True,
                "function_name": "calculation_transcripts_annotation",
                "function_params": [None, None],
            },
            "transcripts_prioritization": {
                "type": "python",
                "name": "transcripts_prioritization",
                "description": "Prioritize transcripts with a prioritization profile (using param.json)",
                "available": True,
                "function_name": "calculation_transcripts_prioritization",
                "function_params": [],
            },
        },
        # Default prioritization profiles: each profile maps an INFO/field
        # name to a list of criteria (type/value/score/flag/comment)
        "prioritizations": {
            "default": {
                "filter": [
                    {
                        "type": "notequals",
                        "value": "!PASS|\\.",
                        "score": 0,
                        "flag": "FILTERED",
                        "comment": ["Bad variant quality"],
                    },
                    {
                        "type": "equals",
                        "value": "REJECT",
                        "score": -20,
                        "flag": "PASS",
                        "comment": ["Bad variant quality"],
                    },
                ],
                "DP": [
                    {
                        "type": "gte",
                        "value": "50",
                        "score": 5,
                        "flag": "PASS",
                        "comment": ["DP higher than 50"],
                    }
                ],
                "ANN": [
                    {
                        "type": "contains",
                        "value": "HIGH",
                        "score": 5,
                        "flag": "PASS",
                        "comment": [
                            "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
                        ],
                    },
                    {
                        "type": "contains",
                        "value": "MODERATE",
                        "score": 3,
                        "flag": "PASS",
                        "comment": [
                            "A non-disruptive variant that might change protein effectiveness"
                        ],
                    },
                    {
                        "type": "contains",
                        "value": "LOW",
                        "score": 0,
                        "flag": "FILTERED",
                        "comment": [
                            "Assumed to be mostly harmless or unlikely to change protein behavior"
                        ],
                    },
                    {
                        "type": "contains",
                        "value": "MODIFIER",
                        "score": 0,
                        "flag": "FILTERED",
                        "comment": [
                            "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
                        ],
                    },
                ],
            }
        },
    }

    # Return an empty dict (not None) for unknown sections, as promised by
    # the docstring and the declared return type; callers (e.g.
    # get_config_json) iterate/update the result and would crash on None.
    return config_default.get(name, {})

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The name of the configuration section to retrieve from the dictionary of default configurations (e.g. "calculations" or "prioritizations")
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(
    self, name: str, config_dict: dict = None, config_file: str = None
) -> dict:
    """
    Build a configuration dictionary for a given section, layering three
    sources in increasing priority: the built-in defaults from
    `get_config_default`, then `config_dict`, then the JSON `config_file`.

    :param name: Name of the configuration section to retrieve, e.g.
        "calculations" or "prioritizations"
    :type name: str
    :param config_dict: Optional dictionary of configuration entries that
        override (or add to) the defaults, key by key
    :type config_dict: dict
    :param config_file: Optional path to a JSON configuration file whose
        entries override any value set so far, key by key
    :type config_file: str
    :return: The merged configuration dictionary
    :raises ValueError: If `config_file` is provided but does not exist
    """

    # Start from the default configuration; guard against a missing
    # section so the merge below cannot fail on None
    configuration = self.get_config_default(name=name) or {}

    # Override entries from the provided dictionary
    # (None-default instead of a mutable default argument)
    if config_dict:
        configuration.update(config_dict)

    # Override entries from the provided JSON file
    config_file = full_path(config_file)
    if config_file:
        if os.path.exists(config_file):
            with open(config_file) as config_file_content:
                config_file_dict = json.load(config_file_content)
            configuration.update(config_file_dict)
        else:
            msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
            log.error(msg_error)
            raise ValueError(msg_error)

    return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or add
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the values from the file
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization( self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
6608    def prioritization(
6609        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
6610    ) -> bool:
6611        """
6612        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
6613        prioritizes variants based on configured profiles and criteria.
6614
6615        :param table: The `table` parameter in the `prioritization` function is used to specify the name
6616        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
6617        a table name is provided, the method will prioritize the variants in that specific table
6618        :type table: str
6619        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
6620        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
6621        provided, the code will use a default prefix value of "PZ"
6622        :type pz_prefix: str
6623        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
6624        additional parameters specific to the prioritization process. These parameters can include
6625        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
6626        configurations needed for the prioritization of variants in a V
6627        :type pz_param: dict
6628        :return: A boolean value (True) is being returned from the `prioritization` function.
6629        """
6630
6631        # Config
6632        config = self.get_config()
6633
6634        # Param
6635        param = self.get_param()
6636
6637        # Prioritization param
6638        if pz_param is not None:
6639            prioritization_param = pz_param
6640        else:
6641            prioritization_param = param.get("prioritization", {})
6642
6643        # Configuration profiles
6644        prioritization_config_file = prioritization_param.get(
6645            "prioritization_config", None
6646        )
6647        prioritization_config_file = full_path(prioritization_config_file)
6648        prioritizations_config = self.get_config_json(
6649            name="prioritizations", config_file=prioritization_config_file
6650        )
6651
6652        # Prioritization prefix
6653        pz_prefix_default = "PZ"
6654        if pz_prefix is None:
6655            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
6656
6657        # Prioritization options
6658        profiles = prioritization_param.get("profiles", [])
6659        if isinstance(profiles, str):
6660            profiles = profiles.split(",")
6661        pzfields = prioritization_param.get(
6662            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
6663        )
6664        if isinstance(pzfields, str):
6665            pzfields = pzfields.split(",")
6666        default_profile = prioritization_param.get("default_profile", None)
6667        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
6668        prioritization_score_mode = prioritization_param.get(
6669            "prioritization_score_mode", "HOWARD"
6670        )
6671
6672        # Quick Prioritizations
6673        prioritizations = param.get("prioritizations", None)
6674        if prioritizations:
6675            log.info("Quick Prioritization:")
6676            for profile in prioritizations.split(","):
6677                if profile not in profiles:
6678                    profiles.append(profile)
6679                    log.info(f"   {profile}")
6680
6681        # If profile "ALL" provided, all profiles in the config profiles
6682        if "ALL" in profiles:
6683            profiles = list(prioritizations_config.keys())
6684
6685        for profile in profiles:
6686            if prioritizations_config.get(profile, None):
6687                log.debug(f"Profile '{profile}' configured")
6688            else:
6689                msg_error = f"Profile '{profile}' NOT configured"
6690                log.error(msg_error)
6691                raise ValueError(msg_error)
6692
6693        if profiles:
6694            log.info(f"Prioritization... ")
6695        else:
6696            log.debug(f"No profile defined")
6697            return False
6698
6699        if not default_profile and len(profiles):
6700            default_profile = profiles[0]
6701
6702        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6703        log.debug("Profiles to check: " + str(list(profiles)))
6704
6705        # Variables
6706        if table is not None:
6707            table_variants = table
6708        else:
6709            table_variants = self.get_table_variants(clause="update")
6710        log.debug(f"Table to prioritize: {table_variants}")
6711
6712        # Added columns
6713        added_columns = []
6714
6715        # Create list of PZfields
6716        # List of PZFields
6717        list_of_pzfields_original = pzfields + [
6718            pzfield + pzfields_sep + profile
6719            for pzfield in pzfields
6720            for profile in profiles
6721        ]
6722        list_of_pzfields = []
6723        log.debug(f"{list_of_pzfields_original}")
6724
6725        # Remove existing PZfields to use if exists
6726        for pzfield in list_of_pzfields_original:
6727            if self.get_header().infos.get(pzfield, None) is None:
6728                list_of_pzfields.append(pzfield)
6729                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6730            else:
6731                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6732
6733        if list_of_pzfields:
6734
6735            # Explode Infos prefix
6736            explode_infos_prefix = self.get_explode_infos_prefix()
6737
6738            # PZfields tags description
6739            PZfields_INFOS = {
6740                f"{pz_prefix}Tags": {
6741                    "ID": f"{pz_prefix}Tags",
6742                    "Number": ".",
6743                    "Type": "String",
6744                    "Description": "Variant tags based on annotation criteria",
6745                },
6746                f"{pz_prefix}Score": {
6747                    "ID": f"{pz_prefix}Score",
6748                    "Number": 1,
6749                    "Type": "Integer",
6750                    "Description": "Variant score based on annotation criteria",
6751                },
6752                f"{pz_prefix}Flag": {
6753                    "ID": f"{pz_prefix}Flag",
6754                    "Number": 1,
6755                    "Type": "String",
6756                    "Description": "Variant flag based on annotation criteria",
6757                },
6758                f"{pz_prefix}Comment": {
6759                    "ID": f"{pz_prefix}Comment",
6760                    "Number": ".",
6761                    "Type": "String",
6762                    "Description": "Variant comment based on annotation criteria",
6763                },
6764                f"{pz_prefix}Infos": {
6765                    "ID": f"{pz_prefix}Infos",
6766                    "Number": ".",
6767                    "Type": "String",
6768                    "Description": "Variant infos based on annotation criteria",
6769                },
6770            }
6771
6772            # Create INFO fields if not exist
6773            for field in PZfields_INFOS:
6774                field_ID = PZfields_INFOS[field]["ID"]
6775                field_description = PZfields_INFOS[field]["Description"]
6776                if field_ID not in self.get_header().infos and field_ID in pzfields:
6777                    field_description = (
6778                        PZfields_INFOS[field]["Description"]
6779                        + f", profile {default_profile}"
6780                    )
6781                    self.get_header().infos[field_ID] = vcf.parser._Info(
6782                        field_ID,
6783                        PZfields_INFOS[field]["Number"],
6784                        PZfields_INFOS[field]["Type"],
6785                        field_description,
6786                        "unknown",
6787                        "unknown",
6788                        code_type_map[PZfields_INFOS[field]["Type"]],
6789                    )
6790
6791            # Create INFO fields if not exist for each profile
6792            for profile in prioritizations_config:
6793                if profile in profiles or profiles == []:
6794                    for field in PZfields_INFOS:
6795                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6796                        field_description = (
6797                            PZfields_INFOS[field]["Description"]
6798                            + f", profile {profile}"
6799                        )
6800                        if (
6801                            field_ID not in self.get_header().infos
6802                            and field in pzfields
6803                        ):
6804                            self.get_header().infos[field_ID] = vcf.parser._Info(
6805                                field_ID,
6806                                PZfields_INFOS[field]["Number"],
6807                                PZfields_INFOS[field]["Type"],
6808                                field_description,
6809                                "unknown",
6810                                "unknown",
6811                                code_type_map[PZfields_INFOS[field]["Type"]],
6812                            )
6813
6814            # Header
6815            for pzfield in list_of_pzfields:
6816                if re.match(f"{pz_prefix}Score.*", pzfield):
6817                    added_column = self.add_column(
6818                        table_name=table_variants,
6819                        column_name=pzfield,
6820                        column_type="INTEGER",
6821                        default_value="0",
6822                    )
6823                elif re.match(f"{pz_prefix}Flag.*", pzfield):
6824                    added_column = self.add_column(
6825                        table_name=table_variants,
6826                        column_name=pzfield,
6827                        column_type="BOOLEAN",
6828                        default_value="1",
6829                    )
6830                else:
6831                    added_column = self.add_column(
6832                        table_name=table_variants,
6833                        column_name=pzfield,
6834                        column_type="STRING",
6835                        default_value="''",
6836                    )
6837                added_columns.append(added_column)
6838
6839            # Profiles
6840            if profiles:
6841
6842                # foreach profile in configuration file
6843                for profile in prioritizations_config:
6844
6845                    # If profile is asked in param, or ALL are asked (empty profile [])
6846                    if profile in profiles or profiles == []:
6847                        log.info(f"Profile '{profile}'")
6848
6849                        sql_set_info_option = ""
6850
6851                        sql_set_info = []
6852
6853                        # PZ fields set
6854
6855                        # PZScore
6856                        if (
6857                            f"{pz_prefix}Score{pzfields_sep}{profile}"
6858                            in list_of_pzfields
6859                        ):
6860                            sql_set_info.append(
6861                                f"""
6862                                    concat(
6863                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
6864                                        {pz_prefix}Score{pzfields_sep}{profile}
6865                                    ) 
6866                                """
6867                            )
6868                            if (
6869                                profile == default_profile
6870                                and f"{pz_prefix}Score" in list_of_pzfields
6871                            ):
6872                                sql_set_info.append(
6873                                    f"""
6874                                        concat(
6875                                            '{pz_prefix}Score=',
6876                                            {pz_prefix}Score{pzfields_sep}{profile}
6877                                        )
6878                                    """
6879                                )
6880
6881                        # PZFlag
6882                        if (
6883                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
6884                            in list_of_pzfields
6885                        ):
6886                            sql_set_info.append(
6887                                f"""
6888                                    concat(
6889                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
6890                                        CASE 
6891                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
6892                                            THEN 'PASS'
6893                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
6894                                            THEN 'FILTERED'
6895                                        END
6896                                    ) 
6897                                """
6898                            )
6899                            if (
6900                                profile == default_profile
6901                                and f"{pz_prefix}Flag" in list_of_pzfields
6902                            ):
6903                                sql_set_info.append(
6904                                    f"""
6905                                        concat(
6906                                            '{pz_prefix}Flag=',
6907                                            CASE 
6908                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
6909                                                THEN 'PASS'
6910                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
6911                                                THEN 'FILTERED'
6912                                            END
6913                                        )
6914                                    """
6915                                )
6916
6917                        # PZComment
6918                        if (
6919                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
6920                            in list_of_pzfields
6921                        ):
6922                            sql_set_info.append(
6923                                f"""
6924                                    CASE
6925                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
6926                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
6927                                        ELSE ''
6928                                    END
6929                                """
6930                            )
6931                            if (
6932                                profile == default_profile
6933                                and f"{pz_prefix}Comment" in list_of_pzfields
6934                            ):
6935                                sql_set_info.append(
6936                                    f"""
6937                                        CASE
6938                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
6939                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
6940                                            ELSE ''
6941                                        END
6942                                    """
6943                                )
6944
6945                        # PZInfos
6946                        if (
6947                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
6948                            in list_of_pzfields
6949                        ):
6950                            sql_set_info.append(
6951                                f"""
6952                                    CASE
6953                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
6954                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
6955                                        ELSE ''
6956                                    END
6957                                """
6958                            )
6959                            if (
6960                                profile == default_profile
6961                                and f"{pz_prefix}Infos" in list_of_pzfields
6962                            ):
6963                                sql_set_info.append(
6964                                    f"""
6965                                        CASE
6966                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
6967                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
6968                                            ELSE ''
6969                                        END
6970                                    """
6971                                )
6972
6973                        # Merge PZfields
6974                        sql_set_info_option = ""
6975                        sql_set_sep = ""
6976                        for sql_set in sql_set_info:
6977                            if sql_set_sep:
6978                                sql_set_info_option += f"""
6979                                    , concat('{sql_set_sep}', {sql_set})
6980                                """
6981                            else:
6982                                sql_set_info_option += f"""
6983                                    , {sql_set}
6984                                """
6985                            sql_set_sep = ";"
6986
6987                        sql_queries = []
6988                        for annotation in prioritizations_config[profile]:
6989
6990                            # Explode specific annotation
6991                            log.debug(f"Explode annotation '{annotation}'")
6992                            added_columns += self.explode_infos(
6993                                prefix=explode_infos_prefix,
6994                                fields=[annotation],
6995                                table=table_variants,
6996                            )
6997                            extra_infos = self.get_extra_infos(table=table_variants)
6998
6999                            # Check if annotation field is present
7000                            if not f"{explode_infos_prefix}{annotation}" in extra_infos:
7001                                log.debug(f"Annotation '{annotation}' not in data")
7002                                continue
7003                            else:
7004                                log.debug(f"Annotation '{annotation}' in data")
7005
7006                            # For each criterions
7007                            for criterion in prioritizations_config[profile][
7008                                annotation
7009                            ]:
7010                                criterion_type = criterion["type"]
7011                                criterion_value = criterion["value"]
7012                                criterion_score = criterion.get("score", 0)
7013                                criterion_flag = criterion.get("flag", "PASS")
7014                                criterion_flag_bool = criterion_flag == "PASS"
7015                                criterion_comment = (
7016                                    ", ".join(criterion.get("comment", []))
7017                                    .replace("'", "''")
7018                                    .replace(";", ",")
7019                                    .replace("\t", " ")
7020                                )
7021                                criterion_infos = (
7022                                    str(criterion)
7023                                    .replace("'", "''")
7024                                    .replace(";", ",")
7025                                    .replace("\t", " ")
7026                                )
7027
7028                                sql_set = []
7029                                sql_set_info = []
7030
7031                                # PZ fields set
7032                                if (
7033                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7034                                    in list_of_pzfields
7035                                ):
7036                                    if prioritization_score_mode == "HOWARD":
7037                                        sql_set.append(
7038                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7039                                        )
7040                                    elif prioritization_score_mode == "VaRank":
7041                                        sql_set.append(
7042                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
7043                                        )
7044                                    else:
7045                                        sql_set.append(
7046                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7047                                        )
7048                                if (
7049                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7050                                    in list_of_pzfields
7051                                ):
7052                                    sql_set.append(
7053                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7054                                    )
7055                                if (
7056                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7057                                    in list_of_pzfields
7058                                ):
7059                                    sql_set.append(
7060                                        f"""
7061                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7062                                                concat(
7063                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7064                                                    CASE 
7065                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7066                                                        THEN ', '
7067                                                        ELSE ''
7068                                                    END,
7069                                                    '{criterion_comment}'
7070                                                )
7071                                        """
7072                                    )
7073                                if (
7074                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7075                                    in list_of_pzfields
7076                                ):
7077                                    sql_set.append(
7078                                        f"""
7079                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7080                                                concat(
7081                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7082                                                    '{criterion_infos}'
7083                                                )
7084                                        """
7085                                    )
7086                                sql_set_option = ",".join(sql_set)
7087
7088                                # Criterion and comparison
7089                                if sql_set_option:
7090                                    try:
7091                                        float(criterion_value)
7092                                        sql_update = f"""
7093                                            UPDATE {table_variants}
7094                                            SET {sql_set_option}
7095                                            WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7096                                            AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7097                                            """
7098                                    except:
7099                                        contains_option = ""
7100                                        if criterion_type == "contains":
7101                                            contains_option = ".*"
7102                                        sql_update = f"""
7103                                            UPDATE {table_variants}
7104                                            SET {sql_set_option}
7105                                            WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7106                                            """
7107                                    sql_queries.append(sql_update)
7108                                else:
7109                                    log.warning(
7110                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7111                                    )
7112
7113                        # PZTags
7114                        if (
7115                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7116                            in list_of_pzfields
7117                        ):
7118
7119                            # Create PZFalgs value
7120                            pztags_value = ""
7121                            pztags_sep_default = "|"
7122                            pztags_sep = ""
7123                            for pzfield in pzfields:
7124                                if pzfield not in [f"{pz_prefix}Tags"]:
7125                                    if (
7126                                        f"{pzfield}{pzfields_sep}{profile}"
7127                                        in list_of_pzfields
7128                                    ):
7129                                        if pzfield in [f"{pz_prefix}Flag"]:
7130                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7131                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7132                                                    THEN 'PASS'
7133                                                    ELSE 'FILTERED'
7134                                                END, '"""
7135                                        else:
7136                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7137                                        pztags_sep = pztags_sep_default
7138
7139                            # Add Query update for PZFlags
7140                            sql_update_pztags = f"""
7141                                UPDATE {table_variants}
7142                                SET INFO = concat(
7143                                        INFO,
7144                                        CASE WHEN INFO NOT in ('','.')
7145                                                THEN ';'
7146                                                ELSE ''
7147                                        END,
7148                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7149                                    )
7150                                """
7151                            sql_queries.append(sql_update_pztags)
7152
7153                            # Add Query update for PZFlags for default
7154                            if profile == default_profile:
7155                                sql_update_pztags_default = f"""
7156                                UPDATE {table_variants}
7157                                SET INFO = concat(
7158                                        INFO,
7159                                        ';',
7160                                        '{pz_prefix}Tags={pztags_value}'
7161                                    )
7162                                """
7163                                sql_queries.append(sql_update_pztags_default)
7164
7165                        log.info(f"""Profile '{profile}' - Prioritization... """)
7166
7167                        if sql_queries:
7168
7169                            for sql_query in sql_queries:
7170                                log.debug(
7171                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7172                                )
7173                                self.conn.execute(sql_query)
7174
7175                        log.info(f"""Profile '{profile}' - Update... """)
7176                        sql_query_update = f"""
7177                            UPDATE {table_variants}
7178                            SET INFO =  
7179                                concat(
7180                                    CASE
7181                                        WHEN INFO NOT IN ('','.')
7182                                        THEN concat(INFO, ';')
7183                                        ELSE ''
7184                                    END
7185                                    {sql_set_info_option}
7186                                )
7187                        """
7188                        self.conn.execute(sql_query_update)
7189
7190        else:
7191
7192            log.warning(f"No profiles in parameters")
7193
7194        # Remove added columns
7195        for added_column in added_columns:
7196            self.drop_column(column=added_column)
7197
7198        # Explode INFOS fields into table fields
7199        if self.get_explode_infos():
7200            self.explode_infos(
7201                prefix=self.get_explode_infos_prefix(),
7202                fields=self.get_explode_infos_fields(),
7203                force=True,
7204            )
7205
7206        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

A boolean value (True) is being returned from the prioritization function.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        The annotation result is appended to the INFO column as an 'hgvs=' field and the
        corresponding INFO header entry is registered on the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            Closes over `polars_conn`, `transcripts`, `genome` and the HGVS option flags
            defined in the enclosing method.

            :param row: A dictionary-like object that contains the values for the following keys:
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            # NOTE(review): 'chr' shadows the builtin of the same name; kept as-is (doc-only pass)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts (refseq_df registered in the polars SQLContext)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                # NOTE(review): refseqlink_df is only created when a refSeqLink file was
                # found; if use_protein/add_protein/full_format is enabled without one,
                # this query references a missing table — confirm upstream guarantees
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add the protein-level HGVS in addition to the transcript-level one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create comma-separated list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connection
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse the comma-separated "hgvs_options" string into param["hgvs"]
        # (bare option means True; explicit TRUE/FALSE values are converted to bool)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit genome path first, then folder+assembly lookup
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF and ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (tracked so they can be dropped at the end)
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe (only transcripts overlapping a variant position)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to TSV first because read_transcripts consumes a file handle
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connection
        # NOTE(review): re-creates the SQLContext built at the top of this method —
        # presumably to register the dataframes created above; confirm if the first
        # instantiation is still needed
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe (triggers the actual computation)
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (join back on CHROM/POS/REF/ALT; skip empty/NULL results)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' with ';' separator when INFO is non-empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        # Register the new INFO field on the VCF header
        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7599    def get_operations_help(
7600        self, operations_config_dict: dict = {}, operations_config_file: str = None
7601    ) -> list:
7602
7603        # Init
7604        operations_help = []
7605
7606        # operations
7607        operations = self.get_config_json(
7608            name="calculations",
7609            config_dict=operations_config_dict,
7610            config_file=operations_config_file,
7611        )
7612        for op in operations:
7613            op_name = operations[op].get("name", op).upper()
7614            op_description = operations[op].get("description", op_name)
7615            op_available = operations[op].get("available", False)
7616            if op_available:
7617                operations_help.append(f"   {op_name}: {op_description}")
7618
7619        # Sort operations
7620        operations_help.sort()
7621
7622        # insert header
7623        operations_help.insert(0, "Available calculation operations:")
7624
7625        # Return
7626        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7628    def calculation(
7629        self,
7630        operations: dict = {},
7631        operations_config_dict: dict = {},
7632        operations_config_file: str = None,
7633    ) -> None:
7634        """
7635        It takes a list of operations, and for each operation, it checks if it's a python or sql
7636        operation, and then calls the appropriate function
7637
7638        param json example:
7639            "calculation": {
7640                "NOMEN": {
7641                    "options": {
7642                        "hgvs_field": "hgvs"
7643                    },
7644                "middle" : null
7645            }
7646        """
7647
7648        # Param
7649        param = self.get_param()
7650
7651        # operations config
7652        operations_config = self.get_config_json(
7653            name="calculations",
7654            config_dict=operations_config_dict,
7655            config_file=operations_config_file,
7656        )
7657
7658        # Upper keys
7659        operations_config = {k.upper(): v for k, v in operations_config.items()}
7660
7661        # Calculations
7662
7663        # Operations from param
7664        operations = param.get("calculation", {}).get("calculations", operations)
7665
7666        # Quick calculation - add
7667        if param.get("calculations", None):
7668            calculations_list = [
7669                value for value in param.get("calculations", "").split(",")
7670            ]
7671            log.info(f"Quick Calculations:")
7672            for calculation_key in calculations_list:
7673                log.info(f"   {calculation_key}")
7674            for calculation_operation in calculations_list:
7675                if calculation_operation.upper() not in operations:
7676                    operations[calculation_operation.upper()] = {}
7677                    add_value_into_dict(
7678                        dict_tree=param,
7679                        sections=[
7680                            "calculation",
7681                            "calculations",
7682                            calculation_operation.upper(),
7683                        ],
7684                        value={},
7685                    )
7686
7687        # Operations for calculation
7688        if not operations:
7689            operations = param.get("calculation", {}).get("calculations", {})
7690
7691        if operations:
7692            log.info(f"Calculations...")
7693
7694        # For each operations
7695        for operation_name in operations:
7696            operation_name = operation_name.upper()
7697            if operation_name not in [""]:
7698                if operation_name in operations_config:
7699                    log.info(f"Calculation '{operation_name}'")
7700                    operation = operations_config[operation_name]
7701                    operation_type = operation.get("type", "sql")
7702                    if operation_type == "python":
7703                        self.calculation_process_function(
7704                            operation=operation, operation_name=operation_name
7705                        )
7706                    elif operation_type == "sql":
7707                        self.calculation_process_sql(
7708                            operation=operation, operation_name=operation_name
7709                        )
7710                    else:
7711                        log.error(
7712                            f"Operations config: Type '{operation_type}' NOT available"
7713                        )
7714                        raise ValueError(
7715                            f"Operations config: Type '{operation_type}' NOT available"
7716                        )
7717                else:
7718                    log.error(
7719                        f"Operations config: Calculation '{operation_name}' NOT available"
7720                    )
7721                    raise ValueError(
7722                        f"Operations config: Calculation '{operation_name}' NOT available"
7723                    )
7724
7725        # Explode INFOS fields into table fields
7726        if self.get_explode_infos():
7727            self.explode_infos(
7728                prefix=self.get_explode_infos_prefix(),
7729                fields=self.get_explode_infos_fields(),
7730                force=True,
7731            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "calculations": { "NOMEN": { "options": { "hgvs_field": "hgvs" } }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7733    def calculation_process_sql(
7734        self, operation: dict, operation_name: str = "unknown"
7735    ) -> None:
7736        """
7737        The `calculation_process_sql` function takes in a mathematical operation as a string and
7738        performs the operation, updating the specified table with the result.
7739
7740        :param operation: The `operation` parameter is a dictionary that contains information about the
7741        mathematical operation to be performed. It includes the following keys:
7742        :type operation: dict
7743        :param operation_name: The `operation_name` parameter is a string that represents the name of
7744        the mathematical operation being performed. It is used for logging and error handling purposes,
7745        defaults to unknown
7746        :type operation_name: str (optional)
7747        """
7748
7749        # table variants
7750        table_variants = self.get_table_variants(clause="alter")
7751
7752        # Operation infos
7753        operation_name = operation.get("name", "unknown")
7754        log.debug(f"process sql {operation_name}")
7755        output_column_name = operation.get("output_column_name", operation_name)
7756        output_column_type = operation.get("output_column_type", "String")
7757        prefix = operation.get("explode_infos_prefix", "")
7758        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7759        output_column_description = operation.get(
7760            "output_column_description", f"{operation_name} operation"
7761        )
7762        operation_query = operation.get("operation_query", None)
7763        if isinstance(operation_query, list):
7764            operation_query = " ".join(operation_query)
7765        operation_info_fields = operation.get("info_fields", [])
7766        operation_info_fields_check = operation.get("info_fields_check", False)
7767        operation_info = operation.get("operation_info", True)
7768
7769        if operation_query:
7770
7771            # Info fields check
7772            operation_info_fields_check_result = True
7773            if operation_info_fields_check:
7774                header_infos = self.get_header().infos
7775                for info_field in operation_info_fields:
7776                    operation_info_fields_check_result = (
7777                        operation_info_fields_check_result
7778                        and info_field in header_infos
7779                    )
7780
7781            # If info fields available
7782            if operation_info_fields_check_result:
7783
7784                # Added_columns
7785                added_columns = []
7786
7787                # Create VCF header field
7788                vcf_reader = self.get_header()
7789                vcf_reader.infos[output_column_name] = vcf.parser._Info(
7790                    output_column_name,
7791                    ".",
7792                    output_column_type,
7793                    output_column_description,
7794                    "howard calculation",
7795                    "0",
7796                    self.code_type_map.get(output_column_type),
7797                )
7798
7799                # Explode infos if needed
7800                log.debug(f"calculation_process_sql prefix {prefix}")
7801                added_columns += self.explode_infos(
7802                    prefix=prefix,
7803                    fields=[output_column_name] + operation_info_fields,
7804                    force=True,
7805                )
7806
7807                # Create column
7808                added_column = self.add_column(
7809                    table_name=table_variants,
7810                    column_name=prefix + output_column_name,
7811                    column_type=output_column_type_sql,
7812                    default_value="null",
7813                )
7814                added_columns.append(added_column)
7815
7816                # Operation calculation
7817                try:
7818
7819                    # Query to update calculation column
7820                    sql_update = f"""
7821                        UPDATE {table_variants}
7822                        SET "{prefix}{output_column_name}" = ({operation_query})
7823                    """
7824                    self.conn.execute(sql_update)
7825
7826                    # Add to INFO
7827                    if operation_info:
7828                        sql_update_info = f"""
7829                            UPDATE {table_variants}
7830                            SET "INFO" =
7831                                concat(
7832                                    CASE
7833                                        WHEN "INFO" IS NOT NULL
7834                                        THEN concat("INFO", ';')
7835                                        ELSE ''
7836                                    END,
7837                                    '{output_column_name}=',
7838                                    "{prefix}{output_column_name}"
7839                                )
7840                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
7841                        """
7842                        self.conn.execute(sql_update_info)
7843
7844                except:
7845                    log.error(
7846                        f"Operations config: Calculation '{operation_name}' query failed"
7847                    )
7848                    raise ValueError(
7849                        f"Operations config: Calculation '{operation_name}' query failed"
7850                    )
7851
7852                # Remove added columns
7853                for added_column in added_columns:
7854                    log.debug(f"added_column: {added_column}")
7855                    self.drop_column(column=added_column)
7856
7857            else:
7858                log.error(
7859                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7860                )
7861                raise ValueError(
7862                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
7863                )
7864
7865        else:
7866            log.error(
7867                f"Operations config: Calculation '{operation_name}' query NOT defined"
7868            )
7869            raise ValueError(
7870                f"Operations config: Calculation '{operation_name}' query NOT defined"
7871            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
7873    def calculation_process_function(
7874        self, operation: dict, operation_name: str = "unknown"
7875    ) -> None:
7876        """
7877        The `calculation_process_function` takes in an operation dictionary and performs the specified
7878        function with the given parameters.
7879
7880        :param operation: The `operation` parameter is a dictionary that contains information about the
7881        operation to be performed. It has the following keys:
7882        :type operation: dict
7883        :param operation_name: The `operation_name` parameter is a string that represents the name of
7884        the operation being performed. It is used for logging purposes, defaults to unknown
7885        :type operation_name: str (optional)
7886        """
7887
7888        operation_name = operation["name"]
7889        log.debug(f"process sql {operation_name}")
7890        function_name = operation["function_name"]
7891        function_params = operation["function_params"]
7892        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
7894    def calculation_variant_id(self) -> None:
7895        """
7896        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
7897        updates the INFO field of a variants table with the variant ID.
7898        """
7899
7900        # variant_id annotation field
7901        variant_id_tag = self.get_variant_id_column()
7902        added_columns = [variant_id_tag]
7903
7904        # variant_id hgvs tags"
7905        vcf_infos_tags = {
7906            variant_id_tag: "howard variant ID annotation",
7907        }
7908
7909        # Variants table
7910        table_variants = self.get_table_variants()
7911
7912        # Header
7913        vcf_reader = self.get_header()
7914
7915        # Add variant_id to header
7916        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
7917            variant_id_tag,
7918            ".",
7919            "String",
7920            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
7921            "howard calculation",
7922            "0",
7923            self.code_type_map.get("String"),
7924        )
7925
7926        # Update
7927        sql_update = f"""
7928            UPDATE {table_variants}
7929            SET "INFO" = 
7930                concat(
7931                    CASE
7932                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
7933                        THEN ''
7934                        ELSE concat("INFO", ';')
7935                    END,
7936                    '{variant_id_tag}=',
7937                    "{variant_id_tag}"
7938                )
7939        """
7940        self.conn.execute(sql_update)
7941
7942        # Remove added columns
7943        for added_column in added_columns:
7944            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
7946    def calculation_extract_snpeff_hgvs(
7947        self,
7948        snpeff_hgvs: str = "snpeff_hgvs",
7949        snpeff_field: str = "ANN",
7950    ) -> None:
7951        """
7952        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
7953        annotation field in a VCF file and adds them as a new column in the variants table.
7954
7955        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
7956        function is used to specify the name of the column that will store the HGVS nomenclatures
7957        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
7958        snpeff_hgvs
7959        :type snpeff_hgvs: str (optional)
7960        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
7961        function represents the field in the VCF file that contains SnpEff annotations. This field is
7962        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
7963        to ANN
7964        :type snpeff_field: str (optional)
7965        """
7966
7967        # Snpeff hgvs tags
7968        vcf_infos_tags = {
7969            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
7970        }
7971
7972        # Prefix
7973        prefix = self.get_explode_infos_prefix()
7974        if prefix:
7975            prefix = "INFO/"
7976
7977        # snpEff fields
7978        speff_ann_infos = prefix + snpeff_field
7979        speff_hgvs_infos = prefix + snpeff_hgvs
7980
7981        # Variants table
7982        table_variants = self.get_table_variants()
7983
7984        # Header
7985        vcf_reader = self.get_header()
7986
7987        # Add columns
7988        added_columns = []
7989
7990        # Explode HGVS field in column
7991        added_columns += self.explode_infos(fields=[snpeff_field])
7992
7993        if snpeff_field in vcf_reader.infos:
7994
7995            log.debug(vcf_reader.infos[snpeff_field])
7996
7997            # Extract ANN header
7998            ann_description = vcf_reader.infos[snpeff_field].desc
7999            pattern = r"'(.+?)'"
8000            match = re.search(pattern, ann_description)
8001            if match:
8002                ann_header_match = match.group(1).split(" | ")
8003                ann_header_desc = {}
8004                for i in range(len(ann_header_match)):
8005                    ann_header_info = "".join(
8006                        char for char in ann_header_match[i] if char.isalnum()
8007                    )
8008                    ann_header_desc[ann_header_info] = ann_header_match[i]
8009                if not ann_header_desc:
8010                    raise ValueError("Invalid header description format")
8011            else:
8012                raise ValueError("Invalid header description format")
8013
8014            # Create variant id
8015            variant_id_column = self.get_variant_id_column()
8016            added_columns += [variant_id_column]
8017
8018            # Create dataframe
8019            dataframe_snpeff_hgvs = self.get_query_to_df(
8020                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8021            )
8022
8023            # Create main NOMEN column
8024            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8025                speff_ann_infos
8026            ].apply(
8027                lambda x: extract_snpeff_hgvs(
8028                    str(x), header=list(ann_header_desc.values())
8029                )
8030            )
8031
8032            # Add snpeff_hgvs to header
8033            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8034                snpeff_hgvs,
8035                ".",
8036                "String",
8037                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8038                "howard calculation",
8039                "0",
8040                self.code_type_map.get("String"),
8041            )
8042
8043            # Update
8044            sql_update = f"""
8045                UPDATE variants
8046                SET "INFO" = 
8047                    concat(
8048                        CASE
8049                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8050                            THEN ''
8051                            ELSE concat("INFO", ';')
8052                        END,
8053                        CASE 
8054                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8055                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8056                            THEN concat(
8057                                    '{snpeff_hgvs}=',
8058                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8059                                )
8060                            ELSE ''
8061                        END
8062                    )
8063                FROM dataframe_snpeff_hgvs
8064                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8065
8066            """
8067            self.conn.execute(sql_update)
8068
8069            # Delete dataframe
8070            del dataframe_snpeff_hgvs
8071            gc.collect()
8072
8073        else:
8074
8075            log.warning(
8076                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8077            )
8078
8079        # Remove added columns
8080        for added_column in added_columns:
8081            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: Name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field of the VCF file. Defaults to snpeff_hgvs.
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8083    def calculation_snpeff_ann_explode(
8084        self,
8085        uniquify: bool = True,
8086        output_format: str = "fields",
8087        output_prefix: str = "snpeff_",
8088        snpeff_field: str = "ANN",
8089    ) -> None:
8090        """
8091        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8092        exploding the HGVS field and updating variant information accordingly.
8093
8094        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8095        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8096        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8097        defaults to True
8098        :type uniquify: bool (optional)
8099        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8100        function specifies the format in which the output annotations will be generated. It has a
8101        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8102        format, defaults to fields
8103        :type output_format: str (optional)
8104        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8105        method is used to specify the prefix that will be added to the output annotations generated
8106        during the calculation process. This prefix helps to differentiate the newly added annotations
8107        from existing ones in the output data. By default, the, defaults to ANN_
8108        :type output_prefix: str (optional)
8109        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8110        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8111        field will be processed to explode the HGVS annotations and update the variant information
8112        accordingly, defaults to ANN
8113        :type snpeff_field: str (optional)
8114        """
8115
8116        # SnpEff annotation field
8117        snpeff_hgvs = "snpeff_ann_explode"
8118
8119        # Snpeff hgvs tags
8120        vcf_infos_tags = {
8121            snpeff_hgvs: "Explode snpEff annotations",
8122        }
8123
8124        # Prefix
8125        prefix = self.get_explode_infos_prefix()
8126        if prefix:
8127            prefix = "INFO/"
8128
8129        # snpEff fields
8130        speff_ann_infos = prefix + snpeff_field
8131        speff_hgvs_infos = prefix + snpeff_hgvs
8132
8133        # Variants table
8134        table_variants = self.get_table_variants()
8135
8136        # Header
8137        vcf_reader = self.get_header()
8138
8139        # Add columns
8140        added_columns = []
8141
8142        # Explode HGVS field in column
8143        added_columns += self.explode_infos(fields=[snpeff_field])
8144        log.debug(f"snpeff_field={snpeff_field}")
8145        log.debug(f"added_columns={added_columns}")
8146
8147        if snpeff_field in vcf_reader.infos:
8148
8149            # Extract ANN header
8150            ann_description = vcf_reader.infos[snpeff_field].desc
8151            pattern = r"'(.+?)'"
8152            match = re.search(pattern, ann_description)
8153            if match:
8154                ann_header_match = match.group(1).split(" | ")
8155                ann_header = []
8156                ann_header_desc = {}
8157                for i in range(len(ann_header_match)):
8158                    ann_header_info = "".join(
8159                        char for char in ann_header_match[i] if char.isalnum()
8160                    )
8161                    ann_header.append(ann_header_info)
8162                    ann_header_desc[ann_header_info] = ann_header_match[i]
8163                if not ann_header_desc:
8164                    raise ValueError("Invalid header description format")
8165            else:
8166                raise ValueError("Invalid header description format")
8167
8168            # Create variant id
8169            variant_id_column = self.get_variant_id_column()
8170            added_columns += [variant_id_column]
8171
8172            # Create dataframe
8173            dataframe_snpeff_hgvs = self.get_query_to_df(
8174                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8175            )
8176
8177            # Create snpEff columns
8178            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8179                speff_ann_infos
8180            ].apply(
8181                lambda x: explode_snpeff_ann(
8182                    str(x),
8183                    uniquify=uniquify,
8184                    output_format=output_format,
8185                    prefix=output_prefix,
8186                    header=list(ann_header_desc.values()),
8187                )
8188            )
8189
8190            # Header
8191            ann_annotations_prefix = ""
8192            if output_format.upper() in ["JSON"]:
8193                ann_annotations_prefix = f"{output_prefix}="
8194                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8195                    output_prefix,
8196                    ".",
8197                    "String",
8198                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8199                    + " - JSON format",
8200                    "howard calculation",
8201                    "0",
8202                    self.code_type_map.get("String"),
8203                )
8204            else:
8205                for ann_annotation in ann_header:
8206                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8207                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8208                        ann_annotation_id,
8209                        ".",
8210                        "String",
8211                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8212                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8213                        "howard calculation",
8214                        "0",
8215                        self.code_type_map.get("String"),
8216                    )
8217
8218            # Update
8219            sql_update = f"""
8220                UPDATE variants
8221                SET "INFO" = 
8222                    concat(
8223                        CASE
8224                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8225                            THEN ''
8226                            ELSE concat("INFO", ';')
8227                        END,
8228                        CASE 
8229                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8230                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8231                            THEN concat(
8232                                '{ann_annotations_prefix}',
8233                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8234                                )
8235                            ELSE ''
8236                        END
8237                    )
8238                FROM dataframe_snpeff_hgvs
8239                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8240
8241            """
8242            self.conn.execute(sql_update)
8243
8244            # Delete dataframe
8245            del dataframe_snpeff_hgvs
8246            gc.collect()
8247
8248        else:
8249
8250            log.warning(
8251                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8252            )
8253
8254        # Remove added columns
8255        for added_column in added_columns:
8256            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method specifies the prefix added to the output annotations generated during the calculation process, distinguishing them from existing annotations in the output data. Defaults to snpeff_.
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature (NOMEN and related fields) from the
        calculation/identification of NOMEN.

        Explodes the configured HGVS INFO field into a table column, computes the
        NOMEN structure for each variant with `find_nomen`, and appends each
        resulting field (NOMEN, CNOMEN, ...) to the INFO column of the variants
        table. Options are read from param under
        calculation.calculations.NOMEN.options:
        - "hgvs_field": INFO field holding the HGVS annotations (default "hgvs")
        - "transcripts": optional transcripts file used to prioritize transcripts

        :raises ValueError: if a transcripts file is configured but does not exist
        """

        # Name of the temporary dataframe column holding the per-variant NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output INFO field name -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix used by explode_infos for the generated columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts (first column of the transcripts file, if configured)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added to the variants table, dropped again at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe
            # NOTE: the SQL below references this dataframe by its local variable
            # name ("dataframe_hgvs") through DuckDB replacement scans
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # Append ';FIELD=value' only when the computed value is non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update
            # NOTE(review): when INFO is NULL or empty, the first appended field
            # still carries its leading ';' — confirm downstream parsing tolerates
            # a leading separator (other calculations use concat("INFO", ';'))
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory before dropping helper columns)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` performs a calculation to find the number of
        pipeline/sample for a variant and updates the variant information in a VCF file.

        Does nothing when the VCF has no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the annotation field for the
        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
        VCF header and to update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # Field
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (helper column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            # NOTE: the SQL below references this dataframe by its local variable
            # name ("dataframe_findbypipeline") through DuckDB replacement scans
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column (row-wise over FORMAT + sample columns)
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '{tag}=value' to INFO only for non-empty values
            # NOTE(review): targets the literal 'variants' table while the SELECT
            # above uses {table_variants} — confirm both always name the same table
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_findbypipeline
            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8507    def calculation_genotype_concordance(self) -> None:
8508        """
8509        The function `calculation_genotype_concordance` calculates the genotype concordance for
8510        multi-caller VCF files and updates the variant information in the database.
8511        """
8512
8513        # if FORMAT and samples
8514        if (
8515            "FORMAT" in self.get_header_columns_as_list()
8516            and self.get_header_sample_list()
8517        ):
8518
8519            # genotypeconcordance annotation field
8520            genotypeconcordance_tag = "genotypeconcordance"
8521
8522            # VCF infos tags
8523            vcf_infos_tags = {
8524                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8525            }
8526
8527            # Prefix
8528            prefix = self.get_explode_infos_prefix()
8529
8530            # Field
8531            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8532
8533            # Variants table
8534            table_variants = self.get_table_variants()
8535
8536            # Header
8537            vcf_reader = self.get_header()
8538
8539            # Create variant id
8540            variant_id_column = self.get_variant_id_column()
8541            added_columns = [variant_id_column]
8542
8543            # variant_id, FORMAT and samples
8544            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8545                self.get_header_sample_list()
8546            )
8547
8548            # Create dataframe
8549            dataframe_genotypeconcordance = self.get_query_to_df(
8550                f""" SELECT {samples_fields} FROM {table_variants} """
8551            )
8552
8553            # Create genotypeconcordance column
8554            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8555                dataframe_genotypeconcordance.apply(
8556                    lambda row: genotypeconcordance(
8557                        row, samples=self.get_header_sample_list()
8558                    ),
8559                    axis=1,
8560                )
8561            )
8562
8563            # Add genotypeconcordance to header
8564            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8565                genotypeconcordance_tag,
8566                ".",
8567                "String",
8568                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8569                "howard calculation",
8570                "0",
8571                self.code_type_map.get("String"),
8572            )
8573
8574            # Update
8575            sql_update = f"""
8576                UPDATE variants
8577                SET "INFO" = 
8578                    concat(
8579                        CASE
8580                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8581                            THEN ''
8582                            ELSE concat("INFO", ';')
8583                        END,
8584                        CASE
8585                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8586                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8587                            THEN concat(
8588                                    '{genotypeconcordance_tag}=',
8589                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8590                                )
8591                            ELSE ''
8592                        END
8593                    )
8594                FROM dataframe_genotypeconcordance
8595                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8596            """
8597            self.conn.execute(sql_update)
8598
8599            # Remove added columns
8600            for added_column in added_columns:
8601                self.drop_column(column=added_column)
8602
8603            # Delete dataframe
8604            del dataframe_genotypeconcordance
8605            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8607    def calculation_barcode(self, tag: str = "barcode") -> None:
8608        """
8609        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8610        updates the INFO field in the file with the calculated barcode values.
8611
8612        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8613        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8614        the default tag name is set to "barcode", defaults to barcode
8615        :type tag: str (optional)
8616        """
8617
8618        # if FORMAT and samples
8619        if (
8620            "FORMAT" in self.get_header_columns_as_list()
8621            and self.get_header_sample_list()
8622        ):
8623
8624            # barcode annotation field
8625            if not tag:
8626                tag = "barcode"
8627
8628            # VCF infos tags
8629            vcf_infos_tags = {
8630                tag: "barcode calculation (VaRank)",
8631            }
8632
8633            # Prefix
8634            prefix = self.get_explode_infos_prefix()
8635
8636            # Field
8637            barcode_infos = prefix + tag
8638
8639            # Variants table
8640            table_variants = self.get_table_variants()
8641
8642            # Header
8643            vcf_reader = self.get_header()
8644
8645            # Create variant id
8646            variant_id_column = self.get_variant_id_column()
8647            added_columns = [variant_id_column]
8648
8649            # variant_id, FORMAT and samples
8650            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8651                self.get_header_sample_list()
8652            )
8653
8654            # Create dataframe
8655            dataframe_barcode = self.get_query_to_df(
8656                f""" SELECT {samples_fields} FROM {table_variants} """
8657            )
8658
8659            # Create barcode column
8660            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8661                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8662            )
8663
8664            # Add barcode to header
8665            vcf_reader.infos[tag] = vcf.parser._Info(
8666                tag,
8667                ".",
8668                "String",
8669                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8670                "howard calculation",
8671                "0",
8672                self.code_type_map.get("String"),
8673            )
8674
8675            # Update
8676            sql_update = f"""
8677                UPDATE {table_variants}
8678                SET "INFO" = 
8679                    concat(
8680                        CASE
8681                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8682                            THEN ''
8683                            ELSE concat("INFO", ';')
8684                        END,
8685                        CASE
8686                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8687                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8688                            THEN concat(
8689                                    '{tag}=',
8690                                    dataframe_barcode."{barcode_infos}"
8691                                )
8692                            ELSE ''
8693                        END
8694                    )
8695                FROM dataframe_barcode
8696                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8697            """
8698            self.conn.execute(sql_update)
8699
8700            # Remove added columns
8701            for added_column in added_columns:
8702                self.drop_column(column=added_column)
8703
8704            # Delete dataframe
8705            del dataframe_barcode
8706            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates barcode values for a family of samples
        and appends them to the FORMAT/sample columns of the VCF file.

        The family is described by a pedigree (option 'family_pedigree' of the
        'BARCODEFAMILY' calculation): a path to a JSON file, a JSON string, a
        comma-separated list of sample names, or a dict mapping member to sample.
        Without a pedigree, all samples of the VCF are used. Two FORMAT subfields
        are appended to every sample column: the barcode value ('{tag}') and the
        list of family samples ('{tag}S'); samples outside the family get '.'.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicit empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix used for the intermediate dataframe column
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (JSON mapping member -> sample name)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: try JSON first, otherwise treat it as a
                # comma-separated list of sample names (each mapping to itself)
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of family sample names
                ped_samples = list(ped.values())

            else:
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (helper column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            # NOTE: the SQL below references this dataframe by its local variable
            # name ("dataframe_barcode") through DuckDB replacement scans
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise over FORMAT + family sample columns)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields ('{tag}' and '{tag}S') to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append the two new subfields to FORMAT and to every sample
            # column. For './.' genotypes, missing FORMAT subfields are padded
            # with '.' by stripping FORMAT down to its ':' skeleton — presumably
            # to keep genotype fields aligned with FORMAT; confirm against caller.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # characters removed from FORMAT to build the './.' padding skeleton
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
8898    def calculation_trio(self) -> None:
8899        """
8900        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
8901        information to the INFO field of each variant.
8902        """
8903
8904        # if FORMAT and samples
8905        if (
8906            "FORMAT" in self.get_header_columns_as_list()
8907            and self.get_header_sample_list()
8908        ):
8909
8910            # trio annotation field
8911            trio_tag = "trio"
8912
8913            # VCF infos tags
8914            vcf_infos_tags = {
8915                "trio": "trio calculation",
8916            }
8917
8918            # Param
8919            param = self.get_param()
8920
8921            # Prefix
8922            prefix = self.get_explode_infos_prefix()
8923
8924            # Trio param
8925            trio_ped = (
8926                param.get("calculation", {})
8927                .get("calculations", {})
8928                .get("TRIO", {})
8929                .get("trio_pedigree", None)
8930            )
8931
8932            # Load trio
8933            if trio_ped:
8934
8935                # Trio pedigree is a file
8936                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
8937                    log.debug("TRIO pedigree is file")
8938                    with open(full_path(trio_ped)) as trio_ped:
8939                        trio_ped = json.load(trio_ped)
8940
8941                # Trio pedigree is a string
8942                elif isinstance(trio_ped, str):
8943                    log.debug("TRIO pedigree is str")
8944                    try:
8945                        trio_ped = json.loads(trio_ped)
8946                        log.debug("TRIO pedigree is json str")
8947                    except ValueError as e:
8948                        trio_samples = trio_ped.split(",")
8949                        if len(trio_samples) == 3:
8950                            trio_ped = {
8951                                "father": trio_samples[0],
8952                                "mother": trio_samples[1],
8953                                "child": trio_samples[2],
8954                            }
8955                            log.debug("TRIO pedigree is list str")
8956                        else:
8957                            msg_error = "TRIO pedigree not well formatted"
8958                            log.error(msg_error)
8959                            raise ValueError(msg_error)
8960
8961                # Trio pedigree is a dict
8962                elif isinstance(trio_ped, dict):
8963                    log.debug("TRIO pedigree is dict")
8964
8965                # Trio pedigree is not well formatted
8966                else:
8967                    msg_error = "TRIO pedigree not well formatted"
8968                    log.error(msg_error)
8969                    raise ValueError(msg_error)
8970
8971                # Construct trio list
8972                trio_samples = [
8973                    trio_ped.get("father", ""),
8974                    trio_ped.get("mother", ""),
8975                    trio_ped.get("child", ""),
8976                ]
8977
8978            else:
8979                log.debug("TRIO pedigree not defined. Take the first 3 samples")
8980                samples_list = self.get_header_sample_list()
8981                if len(samples_list) >= 3:
8982                    trio_samples = self.get_header_sample_list()[0:3]
8983                    trio_ped = {
8984                        "father": trio_samples[0],
8985                        "mother": trio_samples[1],
8986                        "child": trio_samples[2],
8987                    }
8988                else:
8989                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
8990                    log.error(msg_error)
8991                    raise ValueError(msg_error)
8992
8993            # Check trio pedigree
8994            if not trio_ped or len(trio_ped) != 3:
8995                msg_error = f"Error in TRIO pedigree: {trio_ped}"
8996                log.error(msg_error)
8997                raise ValueError(msg_error)
8998
8999            # Log
9000            log.info(
9001                f"Calculation 'TRIO' - Samples: "
9002                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9003            )
9004
9005            # Field
9006            trio_infos = prefix + trio_tag
9007
9008            # Variants table
9009            table_variants = self.get_table_variants()
9010
9011            # Header
9012            vcf_reader = self.get_header()
9013
9014            # Create variant id
9015            variant_id_column = self.get_variant_id_column()
9016            added_columns = [variant_id_column]
9017
9018            # variant_id, FORMAT and samples
9019            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9020                self.get_header_sample_list()
9021            )
9022
9023            # Create dataframe
9024            dataframe_trio = self.get_query_to_df(
9025                f""" SELECT {samples_fields} FROM {table_variants} """
9026            )
9027
9028            # Create trio column
9029            dataframe_trio[trio_infos] = dataframe_trio.apply(
9030                lambda row: trio(row, samples=trio_samples), axis=1
9031            )
9032
9033            # Add trio to header
9034            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9035                trio_tag,
9036                ".",
9037                "String",
9038                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9039                "howard calculation",
9040                "0",
9041                self.code_type_map.get("String"),
9042            )
9043
9044            # Update
9045            sql_update = f"""
9046                UPDATE {table_variants}
9047                SET "INFO" = 
9048                    concat(
9049                        CASE
9050                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9051                            THEN ''
9052                            ELSE concat("INFO", ';')
9053                        END,
9054                        CASE
9055                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9056                             AND dataframe_trio."{trio_infos}" NOT NULL
9057                            THEN concat(
9058                                    '{trio_tag}=',
9059                                    dataframe_trio."{trio_infos}"
9060                                )
9061                            ELSE ''
9062                        END
9063                    )
9064                FROM dataframe_trio
9065                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9066            """
9067            self.conn.execute(sql_update)
9068
9069            # Remove added columns
9070            for added_column in added_columns:
9071                self.drop_column(column=added_column)
9072
9073            # Delete dataframe
9074            del dataframe_trio
9075            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9077    def calculation_vaf_normalization(self) -> None:
9078        """
9079        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9080        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9081        :return: The function does not return anything.
9082        """
9083
9084        # if FORMAT and samples
9085        if (
9086            "FORMAT" in self.get_header_columns_as_list()
9087            and self.get_header_sample_list()
9088        ):
9089
9090            # vaf_normalization annotation field
9091            vaf_normalization_tag = "VAF"
9092
9093            # VCF infos tags
9094            vcf_infos_tags = {
9095                "VAF": "VAF Variant Frequency",
9096            }
9097
9098            # Prefix
9099            prefix = self.get_explode_infos_prefix()
9100
9101            # Variants table
9102            table_variants = self.get_table_variants()
9103
9104            # Header
9105            vcf_reader = self.get_header()
9106
9107            # Do not calculate if VAF already exists
9108            if "VAF" in vcf_reader.formats:
9109                log.debug("VAF already on genotypes")
9110                return
9111
9112            # Create variant id
9113            variant_id_column = self.get_variant_id_column()
9114            added_columns = [variant_id_column]
9115
9116            # variant_id, FORMAT and samples
9117            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9118                f""" "{sample}" """ for sample in self.get_header_sample_list()
9119            )
9120
9121            # Create dataframe
9122            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9123            log.debug(f"query={query}")
9124            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9125
9126            vaf_normalization_set = []
9127
9128            # for each sample vaf_normalization
9129            for sample in self.get_header_sample_list():
9130                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9131                    lambda row: vaf_normalization(row, sample=sample), axis=1
9132                )
9133                vaf_normalization_set.append(
9134                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9135                )
9136
9137            # Add VAF to FORMAT
9138            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9139                "FORMAT"
9140            ].apply(lambda x: str(x) + ":VAF")
9141            vaf_normalization_set.append(
9142                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9143            )
9144
9145            # Add vaf_normalization to header
9146            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9147                id=vaf_normalization_tag,
9148                num="1",
9149                type="Float",
9150                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9151                type_code=self.code_type_map.get("Float"),
9152            )
9153
9154            # Create fields to add in INFO
9155            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9156
9157            # Update
9158            sql_update = f"""
9159                UPDATE {table_variants}
9160                SET {sql_vaf_normalization_set}
9161                FROM dataframe_vaf_normalization
9162                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9163
9164            """
9165            self.conn.execute(sql_update)
9166
9167            # Remove added columns
9168            for added_column in added_columns:
9169                self.drop_column(column=added_column)
9170
9171            # Delete dataframe
9172            del dataframe_vaf_normalization
9173            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Generated INFO tags are `<info>_stats_nb`, `_stats_list`, `_stats_min`,
        `_stats_max`, `_stats_mean`, `_stats_mediane` and `_stats_stdev`; each
        one is declared in the VCF header and appended to the INFO column.
        The method is a no-op when the file has no FORMAT column or no samples.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Only applicable when genotypes are present (FORMAT column + samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag for the stats annotation fields
            vaf_stats_tag = info + "_stats"

            # Descriptions of the generated VCF INFO tags
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (dropped again at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe with genotypes
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute per-variant stats (dict of tag -> value) across samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL expressions, one per stats tag
            sql_vaf_stats_fields = []

            # Process each stats tag
            for stat in vcf_infos_tags:

                # Extract this stat into its own dataframe column
                # (missing keys become '' so the SQL CASE below emits nothing)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat tag in the VCF header (INFO field)
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First tag gets no leading separator, subsequent ones do.
                # NOTE(review): if an earlier stat resolves to NULL at SQL
                # level, a later tag's embedded ';' can produce a doubled
                # separator in INFO — confirm genotype_stats always yields
                # all keys or none.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # SQL fragment appending '<sep><stat>=<value>' when present
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Join fragments as arguments of the concat() below
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the stats to the INFO column, joining on the variant id
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, the median, defaults to VAF
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
9313    def calculation_transcripts_annotation(
9314        self, info_json: str = None, info_format: str = None
9315    ) -> None:
9316        """
9317        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
9318        field to it if transcripts are available.
9319
9320        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
9321        is a string parameter that represents the information field to be used in the transcripts JSON.
9322        It is used to specify the JSON format for the transcripts information. If no value is provided
9323        when calling the method, it defaults to "
9324        :type info_json: str
9325        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
9326        method is a string parameter that specifies the format of the information field to be used in
9327        the transcripts JSON. It is used to define the format of the information field
9328        :type info_format: str
9329        """
9330
9331        # Create transcripts table
9332        transcripts_table = self.create_transcript_view()
9333
9334        # Add info field
9335        if transcripts_table:
9336            self.transcript_view_to_variants(
9337                transcripts_table=transcripts_table,
9338                transcripts_info_field_json=info_json,
9339                transcripts_info_field_format=info_format,
9340            )
9341        else:
9342            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
9344    def calculation_transcripts_prioritization(self) -> None:
9345        """
9346        The function `calculation_transcripts_prioritization` creates a transcripts table and
9347        prioritizes transcripts based on certain criteria.
9348        """
9349
9350        # Create transcripts table
9351        transcripts_table = self.create_transcript_view()
9352
9353        # Add info field
9354        if transcripts_table:
9355            self.transcripts_prioritization(transcripts_table=transcripts_table)
9356        else:
9357            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
9363    def transcripts_prioritization(
9364        self, transcripts_table: str = None, param: dict = {}
9365    ) -> bool:
9366        """
9367        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
9368        and updates the variants table with the prioritized information.
9369
9370        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
9371        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
9372        This parameter is used to identify the table where the transcripts data is stored for the
9373        prioritization process
9374        :type transcripts_table: str
9375        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
9376        that contains various configuration settings for the prioritization process of transcripts. It
9377        is used to customize the behavior of the prioritization algorithm and includes settings such as
9378        the prefix for prioritization fields, default profiles, and other
9379        :type param: dict
9380        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
9381        transcripts prioritization process is successfully completed, and `False` if there are any
9382        issues or if no profile is defined for transcripts prioritization.
9383        """
9384
9385        log.debug("Start transcripts prioritization...")
9386
9387        # Param
9388        if not param:
9389            param = self.get_param()
9390
9391        # Variants table
9392        table_variants = self.get_table_variants()
9393        log.debug(f"transcripts_table={transcripts_table}")
9394        # Transcripts table
9395        if transcripts_table is None:
9396            log.debug(f"transcripts_table={transcripts_table}")
9397            transcripts_table = self.create_transcript_view(
9398                transcripts_table="transcripts", param=param
9399            )
9400            log.debug(f"transcripts_table={transcripts_table}")
9401        if transcripts_table is None:
9402            msg_err = "No Transcripts table availalble"
9403            log.error(msg_err)
9404            raise ValueError(msg_err)
9405
9406        # Get transcripts columns
9407        columns_as_list_query = f"""
9408            DESCRIBE {transcripts_table}
9409        """
9410        columns_as_list = list(
9411            self.get_query_to_df(columns_as_list_query)["column_name"]
9412        )
9413
9414        # Create INFO if not exists
9415        if "INFO" not in columns_as_list:
9416            query_add_info = f"""
9417                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
9418            """
9419            self.execute_query(query_add_info)
9420
9421        # Prioritization param and Force only PZ Score and Flag
9422        pz_param = param.get("transcripts", {}).get("prioritization", {})
9423        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
9424        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
9425        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
9426        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
9427        pz_profile_default = (
9428            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
9429        )
9430
9431        # Exit if no profile
9432        if pz_profile_default is None:
9433            log.warning("No profile defined for transcripts prioritization")
9434            return False
9435
9436        # Prioritization
9437        prioritization_result = self.prioritization(
9438            table=transcripts_table,
9439            pz_param=param.get("transcripts", {}).get("prioritization", {}),
9440        )
9441        if not prioritization_result:
9442            log.warning("Transcripts prioritization not processed")
9443            return False
9444
9445        # Explode PZ fields
9446        self.explode_infos(
9447            table=transcripts_table,
9448            fields=param.get("transcripts", {})
9449            .get("prioritization", {})
9450            .get("pzfields", []),
9451        )
9452
9453        # Export Transcripts prioritization infos to variants table
9454        query_update = f"""
9455            WITH RankedTranscripts AS (
9456                SELECT
9457                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
9458                    ROW_NUMBER() OVER (
9459                        PARTITION BY "#CHROM", POS, REF, ALT
9460                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
9461                    ) AS rn
9462                FROM
9463                    {transcripts_table}
9464            )
9465            UPDATE {table_variants}
9466                SET
9467                INFO = CONCAT(CASE
9468                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9469                            THEN ''
9470                            ELSE concat("INFO", ';')
9471                        END,
9472                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
9473                        )
9474            FROM
9475                RankedTranscripts
9476            WHERE
9477                rn = 1
9478                AND variants."#CHROM" = RankedTranscripts."#CHROM"
9479                AND variants."POS" = RankedTranscripts."POS"
9480                AND variants."REF" = RankedTranscripts."REF"
9481                AND variants."ALT" = RankedTranscripts."ALT"
9482                
9483        """
9484        self.execute_query(query=query_update)
9485
9486        # Add PZ Transcript in header
9487        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
9488            pz_fields_transcripts,
9489            ".",
9490            "String",
9491            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
9492            "unknown",
9493            "unknown",
9494            code_type_map["String"],
9495        )
9496
9497        # Return
9498        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9500    def create_transcript_view_from_columns_map(
9501        self,
9502        transcripts_table: str = "transcripts",
9503        columns_maps: dict = {},
9504        added_columns: list = [],
9505        temporary_tables: list = None,
9506        annotation_fields: list = None,
9507    ) -> tuple[list, list, list]:
9508        """
9509        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9510        specified columns mapping for transcripts data.
9511
9512        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9513        the table where the transcripts data is stored or will be stored in the database. This table
9514        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9515        predictions, etc. It defaults to "transcripts, defaults to transcripts
9516        :type transcripts_table: str (optional)
9517        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9518        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9519        represents a mapping configuration for a specific set of columns. It typically includes details such
9520        as the main transcript column and additional information columns
9521        :type columns_maps: dict
9522        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9523        function is a list that stores the additional columns that will be added to the view being created
9524        based on the columns map provided. These columns are generated by exploding the transcript
9525        information columns along with the main transcript column
9526        :type added_columns: list
9527        :param temporary_tables: The `temporary_tables` parameter in the
9528        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9529        tables created during the process of creating a transcript view from a columns map. These temporary
9530        tables are used to store intermediate results or transformations before the final view is generated
9531        :type temporary_tables: list
9532        :param annotation_fields: The `annotation_fields` parameter in the
9533        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9534        for annotation in the query view creation process. These fields are extracted from the
9535        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9536        :type annotation_fields: list
9537        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9538        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9539        """
9540
9541        log.debug("Start transcrpts view creation from columns map...")
9542
9543        # "from_columns_map": [
9544        #     {
9545        #         "transcripts_column": "Ensembl_transcriptid",
9546        #         "transcripts_infos_columns": [
9547        #             "genename",
9548        #             "Ensembl_geneid",
9549        #             "LIST_S2_score",
9550        #             "LIST_S2_pred",
9551        #         ],
9552        #     },
9553        #     {
9554        #         "transcripts_column": "Ensembl_transcriptid",
9555        #         "transcripts_infos_columns": [
9556        #             "genename",
9557        #             "VARITY_R_score",
9558        #             "Aloft_pred",
9559        #         ],
9560        #     },
9561        # ],
9562
9563        # Init
9564        if temporary_tables is None:
9565            temporary_tables = []
9566        if annotation_fields is None:
9567            annotation_fields = []
9568
9569        # Variants table
9570        table_variants = self.get_table_variants()
9571
9572        for columns_map in columns_maps:
9573
9574            # Transcript column
9575            transcripts_column = columns_map.get("transcripts_column", None)
9576
9577            # Transcripts infos columns
9578            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9579
9580            if transcripts_column is not None:
9581
9582                # Explode
9583                added_columns += self.explode_infos(
9584                    fields=[transcripts_column] + transcripts_infos_columns
9585                )
9586
9587                # View clauses
9588                clause_select = []
9589                for field in [transcripts_column] + transcripts_infos_columns:
9590                    clause_select.append(
9591                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9592                    )
9593                    if field not in [transcripts_column]:
9594                        annotation_fields.append(field)
9595
9596                # Querey View
9597                query = f""" 
9598                    SELECT
9599                        "#CHROM", POS, REF, ALT,
9600                        "{transcripts_column}" AS 'transcript',
9601                        {", ".join(clause_select)}
9602                    FROM (
9603                        SELECT 
9604                            "#CHROM", POS, REF, ALT,
9605                            {", ".join(clause_select)}
9606                        FROM {table_variants}
9607                        )
9608                    WHERE "{transcripts_column}" IS NOT NULL
9609                """
9610
9611                # Create temporary table
9612                temporary_table = transcripts_table + "".join(
9613                    random.choices(string.ascii_uppercase + string.digits, k=10)
9614                )
9615
9616                # Temporary_tables
9617                temporary_tables.append(temporary_table)
9618                query_view = f"""
9619                    CREATE TEMPORARY TABLE {temporary_table}
9620                    AS ({query})
9621                """
9622                self.execute_query(query=query_view)
9623
9624        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns specified in the columns_maps entries.
Returns

The function create_transcript_view_from_columns_map returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9626    def create_transcript_view_from_column_format(
9627        self,
9628        transcripts_table: str = "transcripts",
9629        column_formats: dict = {},
9630        temporary_tables: list = None,
9631        annotation_fields: list = None,
9632    ) -> tuple[list, list, list]:
9633        """
9634        The `create_transcript_view_from_column_format` function generates a transcript view based on
9635        specified column formats, adds additional columns and annotation fields, and returns the list of
9636        temporary tables and annotation fields.
9637
9638        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9639        the table containing the transcripts data. This table will be used as the base table for creating
9640        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9641        different table name if needed, defaults to transcripts
9642        :type transcripts_table: str (optional)
9643        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9644        about the columns to be used for creating the transcript view. Each entry in the dictionary
9645        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9646        the provided code snippet:
9647        :type column_formats: dict
9648        :param temporary_tables: The `temporary_tables` parameter in the
9649        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9650        views created during the process of creating a transcript view from a column format. These temporary
9651        views are used to manipulate and extract data before generating the final transcript view. It
9652        :type temporary_tables: list
9653        :param annotation_fields: The `annotation_fields` parameter in the
9654        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9655        that are extracted from the temporary views created during the process. These annotation fields are
9656        obtained by querying the temporary views and extracting the column names excluding specific columns
9657        like `#CH
9658        :type annotation_fields: list
9659        :return: The `create_transcript_view_from_column_format` function returns two lists:
9660        `temporary_tables` and `annotation_fields`.
9661        """
9662
9663        log.debug("Start transcrpts view creation from column format...")
9664
9665        #  "from_column_format": [
9666        #     {
9667        #         "transcripts_column": "ANN",
9668        #         "transcripts_infos_column": "Feature_ID",
9669        #     }
9670        # ],
9671
9672        # Init
9673        if temporary_tables is None:
9674            temporary_tables = []
9675        if annotation_fields is None:
9676            annotation_fields = []
9677
9678        for column_format in column_formats:
9679
9680            # annotation field and transcript annotation field
9681            annotation_field = column_format.get("transcripts_column", "ANN")
9682            transcript_annotation = column_format.get(
9683                "transcripts_infos_column", "Feature_ID"
9684            )
9685
9686            # Temporary View name
9687            temporary_view_name = transcripts_table + "".join(
9688                random.choices(string.ascii_uppercase + string.digits, k=10)
9689            )
9690
9691            # Create temporary view name
9692            temporary_view_name = self.annotation_format_to_table(
9693                uniquify=True,
9694                annotation_field=annotation_field,
9695                view_name=temporary_view_name,
9696                annotation_id=transcript_annotation,
9697            )
9698
9699            # Annotation fields
9700            if temporary_view_name:
9701                query_annotation_fields = f"""
9702                    SELECT *
9703                    FROM (
9704                        DESCRIBE SELECT *
9705                        FROM {temporary_view_name}
9706                        )
9707                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9708                """
9709                df_annotation_fields = self.get_query_to_df(
9710                    query=query_annotation_fields
9711                )
9712
9713                # Add temporary view and annotation fields
9714                temporary_tables.append(temporary_view_name)
9715                annotation_fields += list(set(df_annotation_fields["column_name"]))
9716
9717        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry specifies the mapping between a transcripts column and a transcripts infos column (for example, mapping the "ANN" column to its "Feature_ID" sub-field).
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view. It
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding specific columns such as #CHROM, POS, REF and ALT.
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
9719    def create_transcript_view(
9720        self,
9721        transcripts_table: str = None,
9722        transcripts_table_drop: bool = True,
9723        param: dict = {},
9724    ) -> str:
9725        """
9726        The `create_transcript_view` function generates a transcript view by processing data from a
9727        specified table based on provided parameters and structural information.
9728
9729        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
9730        is used to specify the name of the table that will store the final transcript view data. If a table
9731        name is not provided, the function will create a new table to store the transcript view data, and by
9732        default,, defaults to transcripts
9733        :type transcripts_table: str (optional)
9734        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
9735        `create_transcript_view` function is a boolean parameter that determines whether to drop the
9736        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
9737        the function will drop the existing transcripts table if it exists, defaults to True
9738        :type transcripts_table_drop: bool (optional)
9739        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
9740        contains information needed to create a transcript view. It includes details such as the structure
9741        of the transcripts, columns mapping, column formats, and other necessary information for generating
9742        the view. This parameter allows for flexibility and customization
9743        :type param: dict
9744        :return: The `create_transcript_view` function returns the name of the transcripts table that was
9745        created or modified during the execution of the function.
9746        """
9747
9748        log.debug("Start transcripts view creation...")
9749
9750        # Default
9751        transcripts_table_default = "transcripts"
9752
9753        # Param
9754        if not param:
9755            param = self.get_param()
9756
9757        # Struct
9758        struct = param.get("transcripts", {}).get("struct", None)
9759
9760        if struct:
9761
9762            # Transcripts table
9763            if transcripts_table is None:
9764                transcripts_table = param.get("transcripts", {}).get(
9765                    "table", transcripts_table_default
9766                )
9767
9768            # added_columns
9769            added_columns = []
9770
9771            # Temporary tables
9772            temporary_tables = []
9773
9774            # Annotation fields
9775            annotation_fields = []
9776
9777            # from columns map
9778            columns_maps = struct.get("from_columns_map", [])
9779            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
9780                self.create_transcript_view_from_columns_map(
9781                    transcripts_table=transcripts_table,
9782                    columns_maps=columns_maps,
9783                    added_columns=added_columns,
9784                    temporary_tables=temporary_tables,
9785                    annotation_fields=annotation_fields,
9786                )
9787            )
9788            added_columns += added_columns_tmp
9789            temporary_tables += temporary_tables_tmp
9790            annotation_fields += annotation_fields_tmp
9791
9792            # from column format
9793            column_formats = struct.get("from_column_format", [])
9794            temporary_tables_tmp, annotation_fields_tmp = (
9795                self.create_transcript_view_from_column_format(
9796                    transcripts_table=transcripts_table,
9797                    column_formats=column_formats,
9798                    temporary_tables=temporary_tables,
9799                    annotation_fields=annotation_fields,
9800                )
9801            )
9802            temporary_tables += temporary_tables_tmp
9803            annotation_fields += annotation_fields_tmp
9804
9805            # Merge temporary tables query
9806            query_merge = ""
9807            for temporary_table in temporary_tables:
9808
9809                # First temporary table
9810                if not query_merge:
9811                    query_merge = f"""
9812                        SELECT * FROM {temporary_table}
9813                    """
9814                # other temporary table (using UNION)
9815                else:
9816                    query_merge += f"""
9817                        UNION BY NAME SELECT * FROM {temporary_table}
9818                    """
9819
9820            # Merge on transcript
9821            query_merge_on_transcripts_annotation_fields = []
9822            # Aggregate all annotations fields
9823            for annotation_field in set(annotation_fields):
9824                query_merge_on_transcripts_annotation_fields.append(
9825                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
9826                )
9827            # Query for transcripts view
9828            query_merge_on_transcripts = f"""
9829                SELECT "#CHROM", POS, REF, ALT, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
9830                FROM ({query_merge})
9831                GROUP BY "#CHROM", POS, REF, ALT, transcript
9832            """
9833
9834            # Drop transcript view is necessary
9835            if transcripts_table_drop:
9836                query_drop = f"""
9837                    DROP TABLE IF EXISTS {transcripts_table};
9838                """
9839                self.execute_query(query=query_drop)
9840
9841            # Merge and create transcript view
9842            query_create_view = f"""
9843                CREATE TABLE IF NOT EXISTS {transcripts_table}
9844                AS {query_merge_on_transcripts}
9845            """
9846            self.execute_query(query=query_create_view)
9847
9848            # Remove added columns
9849            for added_column in added_columns:
9850                self.drop_column(column=added_column)
9851
9852        else:
9853
9854            transcripts_table = None
9855
9856        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create the table using the name configured in param, or the default name "transcripts".
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts') -> str:
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        Explode a packed VCF annotation INFO field (e.g. snpEff "ANN") into a
        temporary table with one row per variant/annotation entry.

        The annotation sub-field names are parsed from the quoted part of the
        INFO header description (sub-fields separated by " | "). Each sub-field
        becomes a typed column; the column named by `annotation_id` is also
        exposed as the 'transcript' column.

        :param uniquify: Passed to the explode helper to deduplicate exploded
            annotation values, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the packed annotations,
            defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as transcript
            identifier; non-alphanumeric characters are stripped before it is
            used as a column name, defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            transcripts
        :type view_name: str (optional)
        :return: The name of the created temporary table, or None when
            `annotation_field` is not declared in the VCF header
        """

        # Name of the intermediate column holding the exploded JSON annotation
        annotation_format = "annotation_explode"

        # Sanitize the transcript identifier so it is a safe SQL column name
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): any non-empty prefix is forced to "INFO/" here,
        # regardless of the configured value — confirm this is intended.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Fully-qualified column names for the annotation source and the
        # exploded JSON column
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the INFO field descriptions)
        vcf_reader = self.get_header()

        # Columns added to the variants table during this call; dropped again
        # before returning
        added_columns = []

        # Explode the annotation INFO field into a column of the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation sub-field names from the quoted part of
            # the INFO description (e.g. "... 'A | B | C'")
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized name -> original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Ensure a variant id column exists (added, so dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants plus the packed annotation column into a DataFrame.
            # NOTE: the SQL below refers to this DataFrame by its local
            # variable name ('dataframe_annotation_format') via duckdb's
            # replacement scan — do not rename it.
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each packed annotation value into JSON, keyed by the
            # parsed header sub-field names
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Discover the JSON keys present in the exploded annotations.
            # NOTE(review): the query references {annotation_format} while the
            # DataFrame column is {annotation_format_infos} (prefixed) —
            # confirm the two resolve to the same column when prefix is set.
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Raw JSON key
                key = row.iloc[0]

                # Sanitized key, safe as a SQL column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all values of this key to infer its column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining non-empty values
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed, NULL-safe extraction expression for this key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: variant coordinates, one column per
            # annotation key, plus the transcript identifier column
            query_view = f"""CREATE TEMPORARY TABLE {view_name} AS (SELECT *, {annotation_id} AS 'transcript' FROM (SELECT "#CHROM", POS, REF, ALT, {",".join(query_json_key)} FROM dataframe_annotation_format));"""
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: signal with None
            view_name = None

        # Remove columns added to the variants table during this call
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

The function annotation_format_to_table converts annotation data from a VCF file into a structured table format.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data, helping to uniquely identify each annotation entry. Defaults to Feature_ID.
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
10011    def transcript_view_to_variants(
10012        self,
10013        transcripts_table: str = None,
10014        transcripts_column_id: str = None,
10015        transcripts_info_json: str = None,
10016        transcripts_info_field_json: str = None,
10017        transcripts_info_format: str = None,
10018        transcripts_info_field_format: str = None,
10019        param: dict = {},
10020    ) -> bool:
10021        """
10022        The `transcript_view_to_variants` function updates a variants table with information from
10023        transcripts in JSON format.
10024
10025        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10026        table containing the transcripts data. If this parameter is not provided, the function will
10027        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10028        :type transcripts_table: str
10029        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10030        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10031        identifier is used to match transcripts with variants in the database
10032        :type transcripts_column_id: str
10033        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10034        of the column in the variants table where the transcripts information will be stored in JSON
10035        format. This parameter allows you to define the column in the variants table that will hold the
10036        JSON-formatted information about transcripts
10037        :type transcripts_info_json: str
10038        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10039        specify the field in the VCF header that will contain information about transcripts in JSON
10040        format. This field will be added to the VCF header as an INFO field with the specified name
10041        :type transcripts_info_field_json: str
10042        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10043        format of the information about transcripts that will be stored in the variants table. This
10044        format can be used to define how the transcript information will be structured or displayed
10045        within the variants table
10046        :type transcripts_info_format: str
10047        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10048        specify the field in the VCF header that will contain information about transcripts in a
10049        specific format. This field will be added to the VCF header as an INFO field with the specified
10050        name
10051        :type transcripts_info_field_format: str
10052        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10053        that contains various configuration settings related to transcripts. It is used to provide
10054        default values for certain parameters if they are not explicitly provided when calling the
10055        method. The `param` dictionary can be passed as an argument
10056        :type param: dict
10057        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10058        if the operation is successful and `False` if certain conditions are not met.
10059        """
10060
10061        msg_info_prefix = "Start transcripts view to variants annotations"
10062
10063        log.debug(f"{msg_info_prefix}...")
10064
10065        # Default
10066        transcripts_table_default = "transcripts"
10067        transcripts_column_id_default = "transcript"
10068        transcripts_info_json_default = None
10069        transcripts_info_format_default = None
10070        transcripts_info_field_json_default = None
10071        transcripts_info_field_format_default = None
10072
10073        # Param
10074        if not param:
10075            param = self.get_param()
10076
10077        # Transcripts table
10078        if transcripts_table is None:
10079            transcripts_table = param.get("transcripts", {}).get(
10080                "table", transcripts_table_default
10081            )
10082
10083        # Transcripts column ID
10084        if transcripts_column_id is None:
10085            transcripts_column_id = param.get("transcripts", {}).get(
10086                "column_id", transcripts_column_id_default
10087            )
10088
10089        # Transcripts info json
10090        if transcripts_info_json is None:
10091            transcripts_info_json = param.get("transcripts", {}).get(
10092                "transcripts_info_json", transcripts_info_json_default
10093            )
10094
10095        # Transcripts info field JSON
10096        if transcripts_info_field_json is None:
10097            transcripts_info_field_json = param.get("transcripts", {}).get(
10098                "transcripts_info_field_json", transcripts_info_field_json_default
10099            )
10100        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10101        #     transcripts_info_json = transcripts_info_field_json
10102
10103        # Transcripts info format
10104        if transcripts_info_format is None:
10105            transcripts_info_format = param.get("transcripts", {}).get(
10106                "transcripts_info_format", transcripts_info_format_default
10107            )
10108
10109        # Transcripts info field FORMAT
10110        if transcripts_info_field_format is None:
10111            transcripts_info_field_format = param.get("transcripts", {}).get(
10112                "transcripts_info_field_format", transcripts_info_field_format_default
10113            )
10114        # if (
10115        #     transcripts_info_field_format is not None
10116        #     and transcripts_info_format is None
10117        # ):
10118        #     transcripts_info_format = transcripts_info_field_format
10119
10120        # Variants table
10121        table_variants = self.get_table_variants()
10122
10123        # Check info columns param
10124        if (
10125            transcripts_info_json is None
10126            and transcripts_info_field_json is None
10127            and transcripts_info_format is None
10128            and transcripts_info_field_format is None
10129        ):
10130            return False
10131
10132        # Transcripts infos columns
10133        query_transcripts_infos_columns = f"""
10134            SELECT *
10135            FROM (
10136                DESCRIBE SELECT * FROM {transcripts_table}
10137                )
10138            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10139        """
10140        transcripts_infos_columns = list(
10141            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10142        )
10143
10144        # View results
10145        clause_select = []
10146        clause_to_json = []
10147        clause_to_format = []
10148        for field in transcripts_infos_columns:
10149            clause_select.append(
10150                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10151            )
10152            clause_to_json.append(f""" '{field}': "{field}" """)
10153            clause_to_format.append(f""" "{field}" """)
10154
10155        # Update
10156        update_set_json = []
10157        update_set_format = []
10158
10159        # VCF header
10160        vcf_reader = self.get_header()
10161
10162        # Transcripts to info column in JSON
10163        if transcripts_info_json is not None:
10164
10165            # Create column on variants table
10166            self.add_column(
10167                table_name=table_variants,
10168                column_name=transcripts_info_json,
10169                column_type="JSON",
10170                default_value=None,
10171                drop=False,
10172            )
10173
10174            # Add header
10175            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10176                transcripts_info_json,
10177                ".",
10178                "String",
10179                "Transcripts in JSON format",
10180                "unknwon",
10181                "unknwon",
10182                self.code_type_map["String"],
10183            )
10184
10185            # Add to update
10186            update_set_json.append(
10187                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10188            )
10189
10190        # Transcripts to info field in JSON
10191        if transcripts_info_field_json is not None:
10192
10193            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10194
10195            # Add to update
10196            update_set_json.append(
10197                f""" 
10198                    INFO = concat(
10199                            CASE
10200                                WHEN INFO NOT IN ('', '.')
10201                                THEN INFO
10202                                ELSE ''
10203                            END,
10204                            CASE
10205                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10206                                THEN concat(
10207                                    ';{transcripts_info_field_json}=',
10208                                    t.{transcripts_info_json}
10209                                )
10210                                ELSE ''
10211                            END
10212                            )
10213                """
10214            )
10215
10216            # Add header
10217            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10218                transcripts_info_field_json,
10219                ".",
10220                "String",
10221                "Transcripts in JSON format",
10222                "unknwon",
10223                "unknwon",
10224                self.code_type_map["String"],
10225            )
10226
10227        if update_set_json:
10228
10229            # Update query
10230            query_update = f"""
10231                UPDATE {table_variants}
10232                    SET {", ".join(update_set_json)}
10233                FROM
10234                (
10235                    SELECT
10236                        "#CHROM", POS, REF, ALT,
10237                            concat(
10238                            '{{',
10239                            string_agg(
10240                                '"' || "{transcripts_column_id}" || '":' ||
10241                                to_json(json_output)
10242                            ),
10243                            '}}'
10244                            )::JSON AS {transcripts_info_json}
10245                    FROM
10246                        (
10247                        SELECT
10248                            "#CHROM", POS, REF, ALT,
10249                            "{transcripts_column_id}",
10250                            to_json(
10251                                {{{",".join(clause_to_json)}}}
10252                            )::JSON AS json_output
10253                        FROM
10254                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10255                        WHERE "{transcripts_column_id}" IS NOT NULL
10256                        )
10257                    GROUP BY "#CHROM", POS, REF, ALT
10258                ) AS t
10259                WHERE {table_variants}."#CHROM" = t."#CHROM"
10260                    AND {table_variants}."POS" = t."POS"
10261                    AND {table_variants}."REF" = t."REF"
10262                    AND {table_variants}."ALT" = t."ALT"
10263            """
10264
10265            self.execute_query(query=query_update)
10266
10267        # Transcripts to info column in FORMAT
10268        if transcripts_info_format is not None:
10269
10270            # Create column on variants table
10271            self.add_column(
10272                table_name=table_variants,
10273                column_name=transcripts_info_format,
10274                column_type="VARCHAR",
10275                default_value=None,
10276                drop=False,
10277            )
10278
10279            # Add header
10280            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10281                transcripts_info_format,
10282                ".",
10283                "String",
10284                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10285                "unknwon",
10286                "unknwon",
10287                self.code_type_map["String"],
10288            )
10289
10290            # Add to update
10291            update_set_format.append(
10292                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10293            )
10294
10295        # Transcripts to info field in JSON
10296        if transcripts_info_field_format is not None:
10297
10298            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10299
10300            # Add to update
10301            update_set_format.append(
10302                f""" 
10303                    INFO = concat(
10304                            CASE
10305                                WHEN INFO NOT IN ('', '.')
10306                                THEN INFO
10307                                ELSE ''
10308                            END,
10309                            CASE
10310                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10311                                THEN concat(
10312                                    ';{transcripts_info_field_format}=',
10313                                    t.{transcripts_info_format}
10314                                )
10315                                ELSE ''
10316                            END
10317                            )
10318                """
10319            )
10320
10321            # Add header
10322            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10323                transcripts_info_field_format,
10324                ".",
10325                "String",
10326                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10327                "unknwon",
10328                "unknwon",
10329                self.code_type_map["String"],
10330            )
10331
10332        if update_set_format:
10333
10334            # Update query
10335            query_update = f"""
10336                UPDATE {table_variants}
10337                    SET {", ".join(update_set_format)}
10338                FROM
10339                (
10340                    SELECT
10341                        "#CHROM", POS, REF, ALT,
10342                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10343                    FROM 
10344                        (
10345                        SELECT
10346                            "#CHROM", POS, REF, ALT,
10347                            "{transcripts_column_id}",
10348                            concat(
10349                                "{transcripts_column_id}",
10350                                '|',
10351                                {", '|', ".join(clause_to_format)}
10352                            ) AS {transcripts_info_format}
10353                        FROM
10354                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10355                        )
10356                    GROUP BY "#CHROM", POS, REF, ALT
10357                ) AS t
10358                WHERE {table_variants}."#CHROM" = t."#CHROM"
10359                    AND {table_variants}."POS" = t."POS"
10360                    AND {table_variants}."REF" = t."REF"
10361                    AND {table_variants}."ALT" = t."ALT"
10362            """
10363
10364            self.execute_query(query=query_update)
10365
10366        return True

The transcript_view_to_variants function updates a variants table with information from transcripts, either as JSON or as a structured pipe-delimited annotation (or both).

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or use a default value of "transcripts"
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in a structured, pipe-delimited format (one entry per transcript: 'transcript|field1|field2|...'). This column aggregates the transcript annotations for each variant
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the name of the INFO field that will contain the transcripts information in the structured, pipe-delimited format. This field is appended to the variant's INFO column and declared in the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method. The param dictionary can be passed as an argument
Returns

The function transcript_view_to_variants returns a boolean value. It returns False when none of the four output parameters (transcripts_info_json, transcripts_info_field_json, transcripts_info_format, transcripts_info_field_format) is provided, and True after the requested updates have been applied.